# --- src/corpus.py -----------------------------------------------------------

def runUpdate():
    """Rebuild the "corpus" and "stations" collections when the data is stale.

    Nothing happens unless isUpdateRequired() returns True.  When it does,
    the CORPUS data is fetched, passed through removeEmpty() and
    onlyStations(), and each target collection is dropped and re-inserted
    through mongo.putMany(), which also builds the listed indexes.

    Returns:
        None
    """
    if not isUpdateRequired():
        return

    # Fetch and prepare everything first: any exception raised here happens
    # before a collection has been dropped.
    corpus_data = fetch()
    prepared_corpus = removeEmpty(corpus_data)
    prepared_stations = onlyStations(prepared_corpus)

    corpus_indexes = ["3ALPHA", "NLC"]
    mongo.dropCollection("corpus")
    mongo.putMany("corpus", prepared_corpus, corpus_indexes)

    stations_indexes = ["3ALPHA", "STANOX", "TIPLOC"]
    mongo.dropCollection("stations")
    mongo.putMany("stations", prepared_stations, stations_indexes)


def isUpdateRequired():
    """Report whether the stored CORPUS data is more than two weeks old.

    The last update time comes from mongo.metaCheckTime("corpus") and its age
    from helpers.getAgeInSeconds().

    Returns:
        bool: True when the age exceeds helpers.two_weeks_in_seconds,
        otherwise False.
    """
    update_time = mongo.metaCheckTime("corpus")
    age = helpers.getAgeInSeconds(update_time)
    # str(timedelta) already renders days and H:MM:SS, so the message must
    # not append a "seconds" unit.
    readable_age = str(datetime.timedelta(seconds=age))
    log.out(f"corpus.isUpdateRequired: CORPUS data is {readable_age} old.", "INFO")
    if age > helpers.two_weeks_in_seconds:
        log.out("corpus.isUpdateRequired: CORPUS data required update", "INFO")
        return True
    log.out("corpus.isUpdateRequired: CORPUS data does not need updating", "INFO")
    return False


# --- src/helpers.py ----------------------------------------------------------

# Durations expressed in seconds.  The two-week value is derived from the
# one-day value so the two constants cannot drift apart.
one_day_in_seconds = 24 * 60 * 60  # 86400 (was mistyped as 84600)
two_weeks_in_seconds = 14 * one_day_in_seconds  # 1209600
a/src/main.py +++ b/src/main.py @@ -14,7 +14,7 @@ # program. If not, see # https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE -version = "2023.6.6" +version = "2023.6.8" print(f"main.py: Initialising db-manager v{version}") #Third Party Imports @@ -23,7 +23,7 @@ import time # Import logger import logger as log -log.out(f"main.py: db-manager {version} Initialised", "INFO") +log.out(f"main.py: db-manager {version} Initialised on host {os.uname()[1]}", "INFO") #Local Imports import corpus, mongo, pis, mailer, timetable @@ -38,27 +38,8 @@ while dbReady is False: except: dbReady = False -#Check & Update corpus/stations: - # If older than 12 days then update -corpusAge = int(time.time()) - mongo.metaCheckTime("corpus") -log.out(f'main.py: Corpus is {corpusAge}s old', "INFO") -if corpusAge > 1036800: - log.out('main.py: Updating CORPUS data', "INFO") - corpusData = corpus.removeEmpty(corpus.fetch()) - mongo.putBulkCorpus(corpusData) -else: - log.out('main.py: Not updating CORPUS data until it is 1036800s old.', "INFO") - -stationsAge = int(time.time()) - mongo.metaCheckTime("stations") -log.out(f'main.py: Stations is {stationsAge}s old', "INFO") -# While the source of stations data is CORPUS, this statement is based on corpusAge, when/if changing the source, it should be changed to use stationsAge -# if stationsAge is used now, there could be a situation where stationsAge tries to update but fails as corpusData doesn't exist. 
-if corpusAge > 1036800: - log.out('main.py: Updating stations data', "INFO") - stationData = corpus.onlyStations(corpusData) - mongo.putBulkStations(stationData) -else: - log.out('main.py: Not updating stations data until it is 1036800s old.', "INFO") +## Run CORPUS Update +status = corpus.runUpdate() ## Run PIS Update pis.runUpdate() @@ -66,6 +47,7 @@ pis.runUpdate() ## Run Timetable Update timetable.runUpdate() +## Create general indexes log.out('main.py: Requesting TTL Index Creation', "INFO") mongo.createTtlIndex("users", "atime", 2629800) mongo.createTtlIndex("registrations", "time", 1800) @@ -73,7 +55,6 @@ mongo.createTtlIndex("registrations", "time", 1800) # Push version number to database for reporting mongo.putVersion(version) -# END log.out(f"main.py: db-manager v{version} Complete", "INFO") log.out(f"main.py: Mailing logs") mailer.submitLogs() \ No newline at end of file diff --git a/src/mongo.py b/src/mongo.py index e375cf1..7dd059e 100644 --- a/src/mongo.py +++ b/src/mongo.py @@ -1,6 +1,6 @@ import os from pymongo import MongoClient -import time +import time, datetime import urllib.parse import logger as log @@ -22,7 +22,8 @@ def metaCheckTime(target): incrementCounter("meta") if type(res) is dict: if 'updated' in res: - log.out(f'mongo.metaUpdateTime: {target} last updated at {res["updated"]}', "INFO") + readable_datetime = datetime.datetime.fromtimestamp(res["updated"]) + log.out(f'mongo.metaUpdateTime: {target} last updated at {readable_datetime}', "INFO") return res["updated"] log.out(f'mongo.metaUpdatetime: {target} does not exist', "EROR") return 0 @@ -50,67 +51,14 @@ def createTtlIndex(collection, field, time): col.create_index(field, expireAfterSeconds = time) log.out(f'mongo.createTtlIndex: Created TTL Index of {field} in {collection} to expire after {time} seconds', "INFO") -def putBulkCorpus(data): - collection = "corpus" - startCount = getLength(collection) - col = db[collection] - incrementCounter(collection) - if startCount > 0: - 
def putMany(collection :str, data :list, indexed_fields :list = None):
    """Insert many documents into a collection, then build its indexes.

    Args:
        collection: Name of the target collection in `db`.
        data: Documents handed to insert_many().
        indexed_fields: Optional field names; createSingleIndex() is called
            once per field after the insert.  Omitted or None means no
            indexes are created.

    Returns:
        None
    """
    log.out(f"mongo.putMany: Inserting many documents to: {collection}")
    col = db[collection]
    incrementCounter(collection)
    # NOTE(review): pymongo's insert_many is documented to reject an empty
    # list - confirm callers never pass empty data after dropping a collection.
    col.insert_many(data)
    metaUpdateTime(collection)
    # None sentinel instead of a shared mutable [] default; `or ()` keeps the
    # loop a no-op when no fields were supplied.
    for item in indexed_fields or ():
        createSingleIndex(collection, item)
tiplocs = [] for iii in stops: - print(iii) tiplocs.append(getTiploc(iii)) i['tiplocs'] = tiplocs - print(f"pis.parse: Removed {StartLen - len(codeList)} duplicates") + log.out(f"pis.parse: Removed {StartLen - len(codeList)} duplicates", "INFO") return codeList def getTiploc(crs :str): diff --git a/src/timetable.py b/src/timetable.py index c11e712..98127f3 100644 --- a/src/timetable.py +++ b/src/timetable.py @@ -49,8 +49,9 @@ def isUpdateRequired(): timetableLength = mongo.getLength("timetable") log.out(f"timetable.isUpdateRequired: timetable collection contains {timetableLength} documents", "DBUG") timetableUpdateTime = mongo.metaCheckTime("timetable") - log.out(f"timetable.isUpdateRequired: Timetable last updated at {timetableUpdateTime}", "INFO") timetableDataAge = helpers.getAgeInSeconds(timetableUpdateTime) + readable_age = str(timedelta(seconds=timetableDataAge)) + log.out(f"timetable.isUpdateRequired: Timetable data age: {readable_age}", "INFO") if (timetableDataAge >= twoDayinSecs and isAfter0800) or REBUILD: log.out(f"timetable.isUpdateRequired: timetable collection requires rebuild", "INFO") return "full"