# db-manager/src/corpus.py

#Imports
import os
import requests
import logger as log
import zlib
import json
import datetime
import mongo, helpers

# Network Rail endpoint that fetch() downloads the CORPUS supporting file from.
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

# Fetch configuration: credentials passed as the `auth` pair in fetch().
# os.getenv returns None when a variable is unset, so either value may be None.
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')

# Emitted once at import time so the logs show the module was loaded.
log.out("corpus.py: CORPUS Module Loaded", "DBUG")

def runUpdate():
    """Rebuild the `corpus` and `stations` collections if the data is stale.

    Does nothing when isUpdateRequired() reports the stored data is still
    fresh. Otherwise the CORPUS file is fetched and cleaned *before* any
    collection is dropped, then each collection is dropped and repopulated
    in turn (`corpus` first, then `stations`).

    Returns:
        None
    """
    if not isUpdateRequired():
        return

    cleaned = removeEmpty(fetch())
    stations = onlyStations(cleaned)

    # (collection name, documents, fields handed to mongo.putMany as indexes)
    refresh_plan = (
        ("corpus", cleaned, ["3ALPHA", "NLC"]),
        ("stations", stations, ["3ALPHA", "STANOX", "TIPLOC"]),
    )
    for collection, documents, indexes in refresh_plan:
        mongo.dropCollection(collection)
        mongo.putMany(collection, documents, indexes)

def isUpdateRequired():
    """Report whether the stored CORPUS data is old enough to need refreshing.

    Reads the last update time recorded for "corpus" via mongo.metaCheckTime,
    converts it to an age in seconds with helpers.getAgeInSeconds, and
    compares that age against helpers.two_weeks_in_seconds.

    Returns:
        bool: True when the age is strictly greater than the threshold,
        False otherwise.
    """
    update_time = mongo.metaCheckTime("corpus")
    age = helpers.getAgeInSeconds(update_time)
    # str(timedelta) already renders a human-readable duration
    # (e.g. "15 days, 0:00:00"), so the message must not append a unit.
    readable_age = str(datetime.timedelta(seconds=age))
    log.out(f"corpus.isUpdateRequired: CORPUS data is {readable_age} old.", "INFO")
    if age > helpers.two_weeks_in_seconds:
        log.out("corpus.isUpdateRequired: CORPUS data required update", "INFO")
        return True
    log.out("corpus.isUpdateRequired: CORPUS data does not need updating", "INFO")
    return False


def fetch():
    """Download, decompress and parse the CORPUS file from Network Rail.

    Performs an authenticated GET against CORPUS_URL, increments the
    "corpus_api" counter, then gunzips the response body and parses it as
    JSON.

    Returns:
        The value stored under the 'TIPLOCDATA' key of the parsed document
        (the rest of this module treats it as a list of dictionaries).

    Raises:
        requests.exceptions.Timeout: the server did not connect or respond
            within the timeout.
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status (e.g. bad credentials).
        zlib.error, json.JSONDecodeError, KeyError: the body was not the
            expected gzip-compressed JSON document.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    # requests has no default timeout; without one a stalled connection would
    # block the updater forever. Values are (connect, read) seconds.
    response = requests.get(
        CORPUS_URL,
        auth=(CORPUS_USER, CORPUS_PASS),
        timeout=(10, 60),
    )
    # Count the call as soon as a response arrives, even if it is an error.
    mongo.incrementCounter("corpus_api")
    # Fail with a clear HTTP error instead of an opaque zlib error when the
    # server returns an error page rather than the gzip payload.
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    parsed = json.loads(decompressed.decode())
    return parsed['TIPLOCDATA']

def removeEmpty(data):
    """Return a copy of the CORPUS documents without blank placeholder values.

    Args:
        data: list of dictionaries.

    Returns:
        A new list with one new dictionary per input document, holding every
        key/value pair whose value is not the single-space string " ". The
        input list and its dictionaries are left untouched.
    """
    document_count = len(data)
    log.out(f"corpus.removeEmpty: Removing empty strings from {document_count} CORPUS documents", "INFO")
    return [
        {field: value for field, value in document.items() if value != " "}
        for document in data
    ]

def onlyStations(data):
    """Keep only the documents that carry a `3ALPHA` key.

    Args:
        data: list of dictionaries.

    Returns:
        A new list containing a copy of every document that has a `3ALPHA`
        key, with the `NLC`, `NLCDESC16` and `UIC` fields left out. The input
        documents are not modified.
    """
    excluded_fields = ("NLC", "NLCDESC16", "UIC")
    total = len(data)
    log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {total} documents", "INFO")
    stations = [
        {
            field: value
            for field, value in document.items()
            if field not in excluded_fields
        }
        for document in data
        if '3ALPHA' in document
    ]
    kept = len(stations)
    log.out(f"corpus.onlyStations: Removed {total - kept} documents", "INFO")
    log.out(f"Yes, I am aware there are not {kept} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
    return stations