Review notes for this amended patch (text before the first file header is
ignored by patch tools):
  * The line structure, indentation and blank lines were reconstructed from a
    whitespace-collapsed copy; verify them against the repository before
    applying. The "index" lines still carry the original post-image blob ids.
  * corpus.py: the comment and log message in onlyStations now name 3ALPHA,
    matching the key that is actually tested.
  * main.py: the 12 day threshold is a single named constant and the local
    imports are one per line.
  * mongo.py: metaCheckTime returns 0 instead of raising when no meta
    document exists (first run) and logs under its own name; the bulk
    writers share one helper and refuse empty input before dropping the
    existing collection; the unused update_one result is no longer stored.
  * NOTE(review): mongo.py still ships default credentials ("owl" /
    "twittwoo"); consider requiring OWL_DB_USER / OWL_DB_PASS to be set.

diff --git a/src/corpus.py b/src/corpus.py
index 92f0e28..238caa1 100644
--- a/src/corpus.py
+++ b/src/corpus.py
@@ -40,12 +40,12 @@ def removeEmpty(data):
 def onlyStations(data):
     # DATA: List of Dictionaries
-    # Iterates through the list and checks each item (which is a dictionary) for the STANOX key
-    # If the STANOX exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
+    # Iterates through the list and checks each item (which is a dictionary) for the 3ALPHA key
+    # If the 3ALPHA exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
     preLength = len(data)
-    log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
+    log.out(f"corpus.onlyStations: Removing items without 3ALPHA from {preLength} datapoints", "INFO")
     workingList = []
     for item in data:
-        if 'STANOX' in item:
+        if '3ALPHA' in item:
             workingList.append(item)
     postLength = len(workingList)
     log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
diff --git a/src/main.py b/src/main.py
index 7652b59..8f955e5 100644
--- a/src/main.py
+++ b/src/main.py
@@ -14,19 +14,43 @@
 # program. If not, see
 # https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE
 
-print("main.py: Initialising db-manager")
+version = "0.1.0"
+print(f"main.py: Initialising db-manager v{version}")
 
 #Third Party Imports
 import os
 import time
 
 #Local Imports
 import corpus
-#import mongo
+import mongo
 import logger as log
 
-#This is only a test run:
-rawCorpus = corpus.fetch()
-cleanCorpus = corpus.removeEmpty(rawCorpus)
-stationsOnly = corpus.onlyStations(cleanCorpus)
-print("DONE")
\ No newline at end of file
+log.out("main.py: db-manager Initialised", "INFO")
+
+# Data older than 12 days (expressed in seconds) is refreshed
+MAX_AGE_SECONDS = 12 * 24 * 60 * 60
+
+#Check & Update corpus/stations:
+corpusAge = int(time.time()) - mongo.metaCheckTime("corpus")
+log.out(f'main.py: Corpus is {corpusAge}s old', "INFO")
+if corpusAge > MAX_AGE_SECONDS:
+    log.out('main.py: Updating CORPUS data', "INFO")
+    corpusData = corpus.removeEmpty(corpus.fetch())
+    mongo.putBulkCorpus(corpusData)
+else:
+    log.out('main.py: Not updating CORPUS data', "INFO")
+
+stationsAge = int(time.time()) - mongo.metaCheckTime("stations")
+log.out(f'main.py: Stations is {stationsAge}s old', "INFO")
+# While the source of stations data is CORPUS, this statement is based on corpusAge; when/if changing the source, it should be changed to use stationsAge.
+# If stationsAge were used now, the stations update could be attempted when corpusData doesn't exist, and it would fail.
+if corpusAge > MAX_AGE_SECONDS:
+    log.out('main.py: Updating stations data', "INFO")
+    stationData = corpus.onlyStations(corpusData)
+    mongo.putBulkStations(stationData)
+else:
+    log.out('main.py: Not updating stations data', "INFO")
+
+# END
+log.out(f"main.py: db-manager v{version} Complete", "INFO")
\ No newline at end of file
diff --git a/src/mongo.py b/src/mongo.py
index 6f02310..2b910e0 100644
--- a/src/mongo.py
+++ b/src/mongo.py
@@ -1,9 +1,66 @@
 import os
 from pymongo import MongoClient
+import time
 import urllib.parse
 import logger as log
 
 log.out("mongo.py: Fetching configuration", "INFO")
-db_url = os.getenv('OWL_DB_HOST') + ":" + os.getenv('OWL_DB_PORT')
-db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER'))
-db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS'))
\ No newline at end of file
+db_host = os.getenv('OWL_DB_HOST', 'localhost')
+db_port = os.getenv('OWL_DB_PORT', 27017)
+db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER', "owl"))
+db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS', "twittwoo"))
+db_name = os.getenv('OWL_DB_NAME', "owlboard")
+
+client = MongoClient(f"mongodb://{db_user}:{db_pass}@{db_host}:{db_port}")
+db = client[db_name]
+
+def metaCheckTime(target):
+    # Returns the stored `updated` value for `target`, or 0 if none has been recorded
+    col = db["meta"]
+    res = col.find_one({"target": target})
+    if res is None or "updated" not in res:
+        # find_one returns None when no document matches, e.g. on the first run against an empty database
+        # Returning 0 makes the data appear old enough that the caller performs an update
+        log.out(f'mongo.metaCheckTime: {target} has never been updated', "INFO")
+        return 0
+    log.out(f'mongo.metaCheckTime: {target} last updated at {res["updated"]}', "INFO")
+    return res["updated"]
+
+def metaUpdateTime(target):
+    # Sets the `updated` value for `target` to the current unix time, creating the meta document if it does not exist
+    col = db["meta"]
+    log.out(f'mongo.metaUpdateTime: Updating updated time for {target}', "INFO")
+    col.update_one({"target": target}, {"$set":{"updated": int(time.time())}}, upsert=True)
+
+def getLength(collection):
+    # Returns the number of documents in `collection`
+    col = db[collection]
+    return col.count_documents({})
+
+def _replaceCollection(caller, collection, noun, data):
+    # Replaces the contents of `collection` with `data`, then records the update time in the meta collection
+    # CALLER: function name used as the log prefix, NOUN: description of the documents used in the log output
+    if not data:
+        # insert_many rejects an empty list, so stop before the existing documents are dropped
+        log.out(f'mongo.{caller}: No documents supplied, leaving {collection} unchanged', "INFO")
+        return
+    startCount = getLength(collection)
+    col = db[collection]
+    if startCount > 0:
+        log.out(f'mongo.{caller}: Dropping {startCount} {noun} documents', "INFO")
+        col.drop()
+    col.insert_many(data)
+    endCount = getLength(collection)
+    log.out(f'mongo.{caller}: {endCount} documents inserted', "INFO")
+    log.out(f'mongo.{caller}: {endCount - startCount} new documents', "INFO")
+    # TODO: build collection indexes here
+    log.out(f'mongo.{caller}: Updating meta time',"INFO")
+    metaUpdateTime(collection)
+
+def putBulkCorpus(data):
+    # DATA: List of Dictionaries which replaces the contents of the `corpus` collection
+    _replaceCollection("putBulkCorpus", "corpus", "CORPUS", data)
+
+def putBulkStations(data):
+    # DATA: List of Dictionaries which replaces the contents of the `stations` collection
+    _replaceCollection("putBulkStations", "stations", "station", data)
\ No newline at end of file