# Imports
import os
import zlib
import json
import datetime

import requests

import logger as log
import mongo
import helpers

# Fetch Configuration
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')

log.out("corpus.py: CORPUS Module Loaded", "DBUG")


def runUpdate():
    """Refresh the `corpus` and `stations` collections if the data is stale.

    Does nothing when `isUpdateRequired()` returns False. Otherwise the
    CORPUS data is fetched and prepared *before* either collection is
    dropped, so a failed download leaves the existing data untouched.

    Returns:
        None
    """
    if not isUpdateRequired():
        return

    corpus_data = fetch()
    prepared_corpus = removeEmpty(corpus_data)
    prepared_stations = onlyStations(prepared_corpus)

    corpus_indexes = ["3ALPHA", "NLC"]
    mongo.dropCollection("corpus")
    mongo.putMany("corpus", prepared_corpus, corpus_indexes)

    stations_indexes = ["3ALPHA", "STANOX", "TIPLOC"]
    mongo.dropCollection("stations")
    mongo.putMany("stations", prepared_stations, stations_indexes)


def isUpdateRequired():
    """Report whether the stored CORPUS data is older than two weeks.

    The age is derived from `mongo.metaCheckTime("corpus")` via
    `helpers.getAgeInSeconds` and compared against
    `helpers.two_weeks_in_seconds`.

    Returns:
        True if the data is older than the threshold, otherwise False.
    """
    update_time = mongo.metaCheckTime("corpus")
    age = helpers.getAgeInSeconds(update_time)
    # str(timedelta) is already human readable (e.g. "14 days, 0:00:00"),
    # so the message must not label the value as a number of seconds.
    readable_age = str(datetime.timedelta(seconds=age))
    log.out(f"corpus.isUpdateRequired: CORPUS data is {readable_age} old.", "INFO")
    if age > helpers.two_weeks_in_seconds:
        log.out("corpus.isUpdateRequired: CORPUS data required update", "INFO")
        return True
    log.out("corpus.isUpdateRequired: CORPUS data does not need updating", "INFO")
    return False


def fetch(timeout=60):
    """Download, decompress and parse the CORPUS data.

    Args:
        timeout: Seconds passed to `requests.get` as the connect/read
            timeout, so a stalled connection cannot block forever.

    Returns:
        The value stored under the `TIPLOCDATA` key of the parsed JSON
        (presumably a list of dictionaries - see `removeEmpty`).

    Raises:
        requests.exceptions.RequestException: on connection failure,
            timeout, or a non-success HTTP status.
        KeyError: if the parsed document has no `TIPLOCDATA` key.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(
        helpers.corpus_data_url,
        auth=(CORPUS_USER, CORPUS_PASS),
        timeout=timeout,
    )
    # Count every response received, successful or not.
    mongo.incrementCounter("corpus_api")
    # Fail with a clear HTTP error rather than an obscure zlib error when the
    # server returns an error page instead of the compressed payload.
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    raw_json = zlib.decompress(response.content, 16 + zlib.MAX_WBITS).decode()
    parsed = json.loads(raw_json)
    return parsed['TIPLOCDATA']


def removeEmpty(data):
    """Return a copy of `data` with every " " (single space) value removed.

    Args:
        data: List of dictionaries.

    Returns:
        A new list with one dictionary per input item, each holding only
        the key/value pairs whose value is not equal to " ". The input
        is not modified.
    """
    corpusLength = len(data)
    log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS documents", "INFO")
    return [
        {key: value for key, value in item.items() if value != " "}
        for item in data
    ]


def onlyStations(data):
    """Return only the items that have a `3ALPHA` key, minus NLC/UIC fields.

    Args:
        data: List of dictionaries.

    Returns:
        A new list containing a copy of each item that has a `3ALPHA` key,
        with the `NLC`, `NLCDESC16` and `UIC` keys left out. The input is
        not modified.
    """
    excluded_keys = ("NLC", "NLCDESC16", "UIC")
    preLength = len(data)
    log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {preLength} documents", "INFO")
    workingList = [
        {key: value for key, value in item.items() if key not in excluded_keys}
        for item in data
        if '3ALPHA' in item
    ]
    postLength = len(workingList)
    log.out(f"corpus.onlyStations: Removed {preLength - postLength} documents", "INFO")
    return workingList