db-manager/src/corpus.py

#Imports
import os
import requests
import logger as log
import zlib
import json
import mongo

# Endpoint requested by fetch() to download the CORPUS file.
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

# Fetch configuration: credentials passed as the `auth` pair to requests.get in fetch().
# os.getenv returns None when a variable is unset, so either value may be None.
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')

# Emitted once, when this module is first imported.
log.out("corpus.py: CORPUS Module Loaded", "DBUG")

def fetch(timeout=60):
    """Download the CORPUS file and return its list of TIPLOC records.

    Requests CORPUS_URL with the configured credentials, bumps the
    "corpus_api" counter, then gunzips and JSON-decodes the response body.

    Args:
        timeout: Seconds passed to `requests.get` as its `timeout` argument
            (a single number or a `(connect, read)` tuple). Without it a
            stalled connection would block the caller forever.

    Returns:
        The value stored under the 'TIPLOCDATA' key of the parsed document.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (e.g. rejected credentials).
        zlib.error: If the response body is not valid gzip data.
        KeyError: If the parsed document has no 'TIPLOCDATA' key.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(
        CORPUS_URL,
        auth=(CORPUS_USER, CORPUS_PASS),
        timeout=timeout,
    )
    # Count the call as soon as a response arrives, whatever its status,
    # so the counter keeps reflecting every request that reached the API.
    mongo.incrementCounter("corpus_api")
    # Fail here with a clear HTTP error rather than letting an error page
    # reach zlib and surface as a misleading decompression failure.
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    parsed = json.loads(raw.decode())
    return parsed['TIPLOCDATA']

def removeEmpty(data):
    """Return a cleaned copy of the CORPUS records with " " values removed.

    Args:
        data: List of dictionaries (one per CORPUS document).

    Returns:
        A new list holding one new dictionary per input record, keeping only
        the entries whose value is not the single-space string " ". The
        input list and its dictionaries are left unmodified.
    """
    documentCount = len(data)
    log.out(f"corpus.removeEmpty: Removing empty strings from {documentCount} CORPUS documents", "INFO")
    return [
        {field: value for field, value in record.items() if value != " "}
        for record in data
    ]

def onlyStations(data):
    """Keep only the records that carry a `3ALPHA` key, minus unwanted fields.

    Args:
        data: List of dictionaries (one per CORPUS document).

    Returns:
        A new list with a shallow copy of every record containing the
        '3ALPHA' key, with the 'NLC', 'NLCDESC16' and 'UIC' entries dropped
        when present. Records without '3ALPHA' are discarded. The input is
        not modified.
    """
    droppedFields = ("NLC", "NLCDESC16", "UIC")
    before = len(data)
    log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {before} documents", "INFO")
    stations = []
    for record in data:
        if '3ALPHA' not in record:
            continue
        stations.append(
            {field: value for field, value in record.items() if field not in droppedFields}
        )
    after = len(stations)
    log.out(f"corpus.onlyStations: Removed {before - after} documents", "INFO")
    log.out(f"Yes, I am aware there are not {after} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
    return stations