This repository has been archived on 2024-11-02. You can view files and clone it, but cannot push or open issues or pull requests.
db-manager/src/corpus.py
Fred Boniface 0f7693d798 Bug Fixes:
- Out of date schedules were not being removed from timetable
 - Timetable update was at risk of missing a day
 - cif-data file was not being removed after mailing - mainly an issue when not containerized
 - Added additional helper variables
2023-06-07 21:14:49 +01:00

87 lines
3.3 KiB
Python

#Imports
import os
import requests
import logger as log
import zlib
import json
import datetime
import mongo, helpers
# Fetch configuration: credentials sent as HTTP auth when downloading CORPUS
# data from Network Rail. os.getenv returns None for an unset variable, so
# either value may be None if the environment is not configured.
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
# Logged once, when this module is imported.
log.out("corpus.py: CORPUS Module Loaded", "DBUG")
def runUpdate():
    """Refresh the `corpus` and `stations` collections when the data is stale.

    Does nothing unless isUpdateRequired() reports that an update is due.
    Otherwise the CORPUS data is fetched, stripped of empty values and
    written to the `corpus` collection, and the subset that carries a
    `3ALPHA` code is written to the `stations` collection. Each collection
    is dropped immediately before it is repopulated.

    Returns:
        None in every case.
    """
    if not isUpdateRequired():
        return

    # Fetching and preparation are finished before any collection is dropped.
    prepared_corpus = removeEmpty(fetch())
    prepared_stations = onlyStations(prepared_corpus)

    # The list passed as the third argument names the fields to index.
    mongo.dropCollection("corpus")
    mongo.putMany("corpus", prepared_corpus, ["3ALPHA", "NLC"])

    mongo.dropCollection("stations")
    mongo.putMany("stations", prepared_stations, ["3ALPHA", "STANOX", "TIPLOC"])
def isUpdateRequired():
    """Report whether the stored CORPUS data is due for a refresh.

    The age of the data is derived from the time recorded for "corpus" by
    mongo.metaCheckTime and compared against helpers.two_weeks_in_seconds.

    Returns:
        True when the age is greater than helpers.two_weeks_in_seconds,
        otherwise False.
    """
    update_time = mongo.metaCheckTime("corpus")
    age = helpers.getAgeInSeconds(update_time)
    # str(timedelta) renders as e.g. "15 days, 3:02:10", so the log message
    # must not label the value as a number of seconds.
    readable_age = str(datetime.timedelta(seconds=age))
    log.out(f"corpus.isUpdateRequired: CORPUS data is {readable_age} old.", "INFO")
    if age > helpers.two_weeks_in_seconds:
        log.out("corpus.isUpdateRequired: CORPUS data required update", "INFO")
        return True
    log.out("corpus.isUpdateRequired: CORPUS data does not need updating", "INFO")
    return False
def fetch():
    """Download, decompress and parse the CORPUS extract from Network Rail.

    Returns:
        The value stored under the 'TIPLOCDATA' key of the decoded JSON
        document.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status (e.g. rejected credentials).
        requests.exceptions.Timeout: the server did not connect or stopped
            sending data within the configured limits.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(
        helpers.corpus_data_url,
        auth=(CORPUS_USER, CORPUS_PASS),
        # (connect, read) limits: without them a stalled connection would
        # block the caller forever. The read limit applies between chunks
        # received, not to the total download time.
        timeout=(10, 120),
    )
    # The request reached the API, so it is counted even if it was rejected.
    mongo.incrementCounter("corpus_api")
    # Fail here with a clear HTTP error rather than later with an opaque
    # zlib/JSON error caused by trying to decode an error page.
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    parsed = json.loads(raw.decode())
    return parsed['TIPLOCDATA']
def removeEmpty(data):
    """Return a copy of the CORPUS documents without single-space values.

    Args:
        data: List of dictionaries.

    Returns:
        A new list holding one new dictionary per input document, in the
        same order, keeping only the entries whose value is not equal to
        " ". The input list and its dictionaries are not modified.
    """
    log.out(f"corpus.removeEmpty: Removing empty strings from {len(data)} CORPUS documents", "INFO")
    return [
        {field: value for field, value in document.items() if value != " "}
        for document in data
    ]
def onlyStations(data):
    """Return only the documents that have a `3ALPHA` key, minus NLC fields.

    Args:
        data: List of dictionaries.

    Returns:
        A new list with a new dictionary for every input document that
        contains the `3ALPHA` key, in the original order. The `NLC`,
        `NLCDESC16` and `UIC` entries are left out of each copy; the input
        documents themselves are not modified.
    """
    total = len(data)
    log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {total} documents", "INFO")
    # Fields that are kept in the full corpus but omitted from station documents.
    dropped_fields = ("NLC", "NLCDESC16", "UIC")
    stations = [
        {field: value for field, value in document.items() if field not in dropped_fields}
        for document in data
        if '3ALPHA' in document
    ]
    log.out(f"corpus.onlyStations: Removed {total - len(stations)} documents", "INFO")
    return stations