From 7122233ac3e8b8066d742be1a43087e23f2f4c52 Mon Sep 17 00:00:00 2001 From: Fred Boniface Date: Wed, 31 May 2023 22:09:09 +0100 Subject: [PATCH] PIS Update based on hash rather than time --- .dockerignore | 1 + .gitignore | 1 + src/corpus.py | 3 ++- src/mailer.py | 3 +++ src/main.py | 25 ++++++++---------------- src/mongo.py | 44 ++++++++++++++++++++++++++++++++++++++---- src/pis.py | 50 ++++++++++++++++++++++++++++-------------------- src/timetable.py | 14 +++++++++----- 8 files changed, 93 insertions(+), 48 deletions(-) diff --git a/.dockerignore b/.dockerignore index 25dab36..181c149 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,6 +4,7 @@ Dockerfile .dockerignore .gitignore LICENSE +dbman-log # ---> Python # Byte-compiled / optimized / DLL files diff --git a/.gitignore b/.gitignore index 958561c..183755f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ run.sh env +dbman-log # ---> Python # Byte-compiled / optimized / DLL files diff --git a/src/corpus.py b/src/corpus.py index 0c63997..d4b078c 100644 --- a/src/corpus.py +++ b/src/corpus.py @@ -9,10 +9,11 @@ import mongo CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS" #Fetch Configuration -log.out("corpus.py: Fetching CORPUS Configuration", "INFO") CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER') CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS') +log.out("corpus.py: CORPUS Module Loaded", "DBUG") + def fetch(): log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO") response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS)) diff --git a/src/mailer.py b/src/mailer.py index 3d57e6d..7d5ec01 100644 --- a/src/mailer.py +++ b/src/mailer.py @@ -15,6 +15,9 @@ # https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE import smtplib, ssl, os +import logger as log + +log.out("mailer.py: Mailer module loaded", "DBUG") def submitLogs(): text :str = fetchLogs() diff --git a/src/main.py b/src/main.py index 57da117..128ae12 100644 --- a/src/main.py +++ b/src/main.py @@ -14,21 +14,21 @@ # program. If not, see # https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE -version = "2023.5.7" +version = "2023.5.9" print(f"main.py: Initialising db-manager v{version}") #Third Party Imports import os import time +# Import logger +import logger as log +log.out(f"main.py: db-manager {version} Initialised", "INFO") + #Local Imports import corpus, mongo, pis, mailer, timetable -import logger as log -log.out("main.py: db-manager Initialised", "INFO") - - -#Ensure count document exists in meta, wrap in while look to prevent crashing if the DB is not ready: +# Ensure count document exists in meta, wrap in while look to prevent crashing if the DB is not ready: dbReady = False while dbReady is False: try: @@ -59,17 +59,8 @@ if corpusAge > 1036800: else: log.out('main.py: Not updating stations data until it is 1036800s old.', "INFO") -#Check & Update pis data: - # If older than 2 days then update -pisAge = int(time.time()) - mongo.metaCheckTime("pis") -log.out(f'main.py: PIS Data is {pisAge}s old', "INFO") -if pisAge > 43200: # Temporarily set to 15 minutes - log.out('main.py: Updating PIS data', "INFO") - pisData = pis.load() - pisParsed = pis.parse(pisData) - mongo.putBulkPis(pisParsed) -else: - log.out('main.py: Not updating PIS data until is it 1036800s old', "INFO") +## Run PIS Update +pis.runUpdate() ## Run Timetable Update timetable.runUpdate() diff --git a/src/mongo.py b/src/mongo.py index 3b61057..f55acd5 100644 --- a/src/mongo.py +++ b/src/mongo.py @@ -4,17 +4,18 @@ import time import urllib.parse import logger as log -log.out("mongo.py: Fetching configuration", "INFO") db_host = os.getenv('OWL_DB_HOST', 'localhost') db_port = os.getenv('OWL_DB_PORT', 27017) db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER', "owl")) db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS', "twittwoo")) db_name = os.getenv('OWL_DB_NAME', "owlboard") -log.out(f"mongo.py: Connecting to database at {db_host}:{db_port}", "DBUG") +log.out(f"mongo.py: Database URI: {db_host}:{db_port}", "DBUG") client = MongoClient(f"mongodb://{db_user}:{db_pass}@{db_host}:{db_port}") db = client[db_name] +log.out("mongo.py: Mongo module loaded", "DBUG") + def metaCheckTime(target): col = db["meta"] res = col.find_one({"target": target, "type": "collection"}) @@ -104,6 +105,13 @@ def putBulkPis(data): metaUpdateTime(collection) return +def putMany(collection :str, data :list): + log.out(f"mongo.putMany: Inserting many documents to: {collection}") + col = db[collection] + incrementCounter(collection) + col.insert_many(data) + metaUpdateTime(collection) + def incrementCounter(target): collection = "meta" @@ -116,7 +124,6 @@ def metaCounters(): collection = "meta" col = db[collection] res = col.find_one({"target": "counters","type": "count"}) - log.out(f'mongo.metaCounters: Query returned `{res}`', "DEBG") if type(res) is dict: if 'since' in res: log.out('mongo.metaCounters: counters already exists, skipping', "INFO") @@ -139,10 +146,39 @@ def putTimetable(data): res = col.insert_many(data) def dropCollection(collection): + log.out(f"mongo.dropCollection: Dropping collection '{collection}'") col = db[collection] res = col.drop() def deleteTimetableData(query): collection = "timetable" col = db[collection] - res = col.delete_one(query) \ No newline at end of file + res = col.delete_one(query) + +def getMetaHash(target): + collection = "meta" + col = db[collection] + res = col.find_one({"target": target, "type": "collection"}) + incrementCounter("meta") + if type(res) is dict: + if 'updated' in res: + try: + log.out(f'mongo.metaGetHash: {target} hash is {res["hash"]}', 'INFO') + return res["hash"] + except: + return None + return None + +def putMetaHash(target :str, hash :str): + collection = "meta" + col = db[collection] + filter = { + "target": target, + "type": "collection" + } + update = { + "target": target, + "type": "collection", + "hash": hash + } + res = col.update_one(filter, {"$set": update}, upsert=True) \ No newline at end of file diff --git a/src/pis.py b/src/pis.py index 65ae7cb..86060a9 100644 --- a/src/pis.py +++ b/src/pis.py @@ -1,9 +1,34 @@ -import yaml +import yaml, hashlib +import logger as log +import mongo -print("PIS Module imported") +print("pis.py: PIS Module Loaded", "DBUG") +file_location :str = "/app/data/pis/gwr.yaml" + +def runUpdate(): + if (not requiresUpdate()): + log.out('pis.runUpdate: PIS Codes do not need updating', 'INFO') + return + log.out(f"pis.runUpdate: Update required", "INFO") + pis_data = load() + pis_parsed = parse(pis_data) + mongo.dropCollection("pis") + mongo.putMany("pis", pis_parsed) + +def requiresUpdate(): + currentHash = mongo.getMetaHash("pis") + with open(file_location, "r") as f: + text = f.read() + newHash = hashlib.md5(text.encode()).hexdigest() + if currentHash is None or newHash != currentHash: + log.out(f"pis.requiresUpdate: currentHash: {currentHash}, newHash: {newHash}", "INFO") + mongo.putMetaHash("pis", newHash) + return True + mongo.putMetaHash("pis", newHash) + return False def load(): # Programatically add a `toc` field to each entry. - with open("/app/data/pis/gwr.yaml", "r") as data: + with open(file_location, "r") as data: try: pis = yaml.safe_load(data) print(pis) @@ -12,9 +37,6 @@ def load(): # Programatically add a `toc` field to each entry. print(exc) return exc -## Do some magic here so that if any pis["pis"]["stops"][0] field contains 'reverse' then get the stops for the code stored in pis["pis"]["stops"][1] -## reverse the stops and store that. - def parse(codeList): StartLen = len(codeList) print(f"pis.parse: codeList starting length: {StartLen}") @@ -26,18 +48,4 @@ def parse(codeList): print(f"Identical stopping pattern found: {ii['code']}") codeList.remove(ii) print(f"pis.parse: Removed {StartLen - len(codeList)} duplicates") - return codeList - -def devLoad(): # Programatically add a `toc` field to each entry. - with open("/home/fred.boniface/git/owlboard/db-manager/data/pis/gwr.yaml", "r") as data: - try: - pis = yaml.safe_load(data) - print(pis) - return pis["pis"] - except yaml.YAMLError as exc: - print(exc) - return exc - -def dev(): - data = devLoad() - parse(data) \ No newline at end of file + return codeList \ No newline at end of file diff --git a/src/timetable.py b/src/timetable.py index cb3cfc6..4fcfaf2 100644 --- a/src/timetable.py +++ b/src/timetable.py @@ -14,6 +14,8 @@ # program. If not, see # https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE +REBUILD :bool = False ## Set to true to rebuild database + #Imports import os import requests @@ -27,20 +29,22 @@ from datetime import datetime, timedelta # This module downloads a single TOCs Schedule data now = datetime.now() yesterday = now - timedelta(days=1) -yesterdayDay = yesterday.strftime("%a").lower() +yesterdayDay = yesterday.strftime("%a").lower() TOC_Code = "EF" # Business code for GWR fullDataUrl = f"https://publicdatafeeds.networkrail.co.uk/ntrod/CifFileAuthenticate?type=CIF_{TOC_Code}_TOC_FULL_DAILY&day=toc-full" updateDataUrl = f"https://publicdatafeeds.networkrail.co.uk/ntrod/CifFileAuthenticate?type=CIF_{TOC_Code}_TOC_UPDATE_DAILY&day=toc-update-{yesterdayDay}" CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER') CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS') +log.out("timetable.py: Timetable module loaded", "DBUG") + # Determine state of current Timetable Database def isUpdateRequired(): timetableLength = mongo.getLength("timetable") log.out(f"timetable.isUpdateRequired: timetable collection contains {timetableLength} documents", "DBUG") timetableUpdateDate = mongo.metaCheckTime("timetable") log.out(f"timetable.isUpdateRequired: Timetable last updated at {timetableUpdateDate}", "INFO") - if (not timetableLength or int(time.time()) > timetableUpdateDate + 172800): + if (not timetableLength or int(time.time()) > timetableUpdateDate + 172800 or REBUILD): log.out(f"timetable.isUpdateRequired: timetable collection requires rebuild", "INFO") return "full" if (int(time.time()) > (timetableUpdateDate + 86400)): @@ -99,8 +103,8 @@ def insertSchedule(sch_record): schedule = sch_record['JsonScheduleV1'] scheduleId = schedule['CIF_train_uid'] transactionType = schedule['transaction_type'] - if ('schedule_start_date' in sch_record): - scheduleStart = _helpParseDate(sch_record['schedule_start_date']) + if ('schedule_start_date' in schedule): + scheduleStart = _helpParseDate(schedule['schedule_start_date']) else: now = datetime.now() scheduleStart = now.replace(hour=0,minute=0,second=0,microsecond=0) @@ -167,7 +171,7 @@ def _helpParseDays(string): selectedDays = [daysList[i] for i, value in enumerate(string) if value == "1"] return selectedDays -def _helpParseDate(string, time): +def _helpParseDate(string :str, time :str = "false"): # Incoming string contains date in format %Y-%m-%d, if the time signified end of schedule, # append 23:59:59 to the time, else append 00:00:00 to the string. if time == "end":