# Imports
|
|
import os
|
|
import requests
|
|
import logger as log
|
|
import zlib
|
|
import json
|
|
import datetime
|
|
import mongo, helpers
|
|
|
|
# Network Rail endpoint from which the CORPUS reference data is downloaded.
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

# Fetch configuration: credentials come from the environment and are None
# when the corresponding variable is not set.
CORPUS_USER = os.environ.get('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.environ.get('OWL_LDB_CORPUSPASS')

log.out("corpus.py: CORPUS Module Loaded", "DBUG")
|
|
|
|
def runUpdate():
    """Refresh the `corpus` and `stations` collections when the data is stale.

    Nothing happens when `isUpdateRequired()` returns a falsy value.
    Otherwise the CORPUS data is fetched, passed through `removeEmpty` and
    stored in the `corpus` collection, and the result of `onlyStations` on
    that cleaned data is stored in the `stations` collection. Each
    collection is dropped immediately before it is repopulated.
    """
    if not isUpdateRequired():
        return

    # Both datasets are fully prepared before any collection is dropped.
    cleaned_corpus = removeEmpty(fetch())
    station_docs = onlyStations(cleaned_corpus)

    mongo.dropCollection("corpus")
    mongo.putMany("corpus", cleaned_corpus, ["3ALPHA", "NLC"])

    mongo.dropCollection("stations")
    mongo.putMany("stations", station_docs, ["3ALPHA", "STANOX", "TIPLOC"])
|
|
|
|
def isUpdateRequired():
    """Report whether the stored CORPUS data is old enough to need refreshing.

    The last update time is read with `mongo.metaCheckTime("corpus")` and
    converted to an age in seconds by `helpers.getAgeInSeconds`. An update
    is required when that age is greater than
    `helpers.two_weeks_in_seconds`.

    Returns:
        True when the data should be refreshed, otherwise False.
    """
    update_time = mongo.metaCheckTime("corpus")
    age = helpers.getAgeInSeconds(update_time)
    # str(timedelta) renders as "[D days, ]H:MM:SS", so the message must not
    # label the value as a number of seconds.
    readable_age = str(datetime.timedelta(seconds=age))
    log.out(f"corpus.isUpdateRequired: CORPUS data is {readable_age} old.", "INFO")
    if age > helpers.two_weeks_in_seconds:
        log.out("corpus.isUpdateRequired: CORPUS data required update", "INFO")
        return True
    log.out("corpus.isUpdateRequired: CORPUS data does not need updating", "INFO")
    return False
|
|
|
|
|
|
def fetch(timeout=60):
    """Download, decompress and parse the CORPUS data from Network Rail.

    Args:
        timeout: Value passed to `requests.get` as `timeout` (seconds, or a
            `(connect, read)` tuple). Without one the request could block
            indefinitely on an unresponsive server.

    Returns:
        The value stored under the `TIPLOCDATA` key of the parsed JSON
        document.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP response (e.g. rejected credentials).
        zlib.error: If the response body is not valid gzip data.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(
        CORPUS_URL,
        auth=(CORPUS_USER, CORPUS_PASS),
        timeout=timeout,
    )
    # The counter records every completed request, including rejected ones,
    # so it is incremented before the status check.
    mongo.incrementCounter("corpus_api")
    # Fail with a clear HTTP error instead of trying to gunzip an error page.
    response.raise_for_status()

    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    parsed = json.loads(raw.decode())
    return parsed['TIPLOCDATA']
|
|
|
|
def removeEmpty(data):
    """Return copies of the CORPUS documents without their blank values.

    DATA: List of Dictionaries

    A new dictionary is built for every document, keeping only the entries
    whose value is not equal to the single-space string " ". The input
    dictionaries are left unmodified and document order is preserved.
    """
    log.out(f"corpus.removeEmpty: Removing empty strings from {len(data)} CORPUS documents", "INFO")
    return [
        {field: value for field, value in document.items() if value != " "}
        for document in data
    ]
|
|
|
|
def onlyStations(data):
    """Return the documents that have a `3ALPHA` key, minus a few fields.

    DATA: List of Dictionaries

    Documents without a `3ALPHA` key are skipped. Each remaining document
    is copied without its `NLC`, `NLCDESC16` and `UIC` entries, so the
    input dictionaries are left unmodified. The number of skipped
    documents is logged.
    """
    dropped_fields = ("NLC", "NLCDESC16", "UIC")
    total = len(data)
    log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {total} documents", "INFO")

    stations = [
        {field: value for field, value in entry.items() if field not in dropped_fields}
        for entry in data
        if '3ALPHA' in entry
    ]

    kept = len(stations)
    log.out(f"corpus.onlyStations: Removed {total - kept} documents", "INFO")
    log.out(f"Yes, I am aware there are not {kept} stations but the data includes NI, TfL, some bus, tram and closed stations too", "OTHR")
    return stations