db-manager/src/corpus.py

61 lines
2.5 KiB
Python
Raw Normal View History

2023-02-11 15:16:25 +00:00
#Imports
import os
import requests
import logger as log
2023-02-11 16:48:42 +00:00
import zlib
2023-02-11 20:10:11 +00:00
import json
import mongo
2023-02-11 15:16:25 +00:00
# Network Rail endpoint that serves the CORPUS supporting file
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

# Credentials are read from the environment once, at import time.
# Either value is None when its variable is not set.
log.out("corpus.py: Fetching CORPUS Configuration", "INFO")
CORPUS_USER = os.environ.get('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.environ.get('OWL_LDB_CORPUSPASS')
2023-02-11 16:48:42 +00:00
def fetch(timeout=60):
    # Downloads the CORPUS file from CORPUS_URL using HTTP basic auth, gunzips it,
    # parses the JSON and returns the value stored under the 'TIPLOCDATA' key.
    # timeout: seconds handed to requests.get so that a stalled connection cannot
    #          block the caller forever (defaults to 60).
    # Raises requests.HTTPError when the server replies with an error status, so an
    # error page is reported as such instead of failing later inside zlib/json.
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS), timeout=timeout)
    # The counter is bumped for every request that got a response, successful or not.
    # NOTE(review): no bare `incrementCounter` is defined or imported in this module,
    # so it is called through the imported `mongo` module - confirm that is where
    # the helper lives.
    mongo.incrementCounter("corpus_api")
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    rawJson = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    parsed = json.loads(rawJson.decode())
    return parsed['TIPLOCDATA']
def removeEmpty(data):
    # DATA: List of Dictionaries
    # Returns a new list holding one new dictionary per input dictionary, in the
    # same order. Each new dictionary keeps only the entries whose value is not
    # the single-space string " "; every other value is carried over untouched.
    # Neither the input list nor its dictionaries are modified.
    documentCount = len(data)
    log.out(f"corpus.removeEmpty: Removing empty strings from {documentCount} CORPUS documents", "INFO")
    return [
        {field: value for field, value in document.items() if value != " "}
        for document in data
    ]
def onlyStations(data):
    # DATA: List of Dictionaries
    # Keeps only the dictionaries that contain a `3ALPHA` key. Each kept item is
    # copied, the NLC, NLCDESC16 and UIC fields are dropped from the copy (if
    # present), and the copies are returned as a new list in their original order.
    # The input dictionaries themselves are not modified.
    droppedFields = ("NLC", "NLCDESC16", "UIC")
    preLength = len(data)
    log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {preLength} documents", "INFO")
    stations = []
    for document in data:
        if '3ALPHA' not in document:
            continue
        station = dict(document)
        for field in droppedFields:
            # A default is supplied so a missing field is skipped rather than raising KeyError
            station.pop(field, False)
        stations.append(station)
    postLength = len(stations)
    removedCount = preLength - postLength
    log.out(f"corpus.onlyStations: Removed {removedCount} documents", "INFO")
    log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
    return stations