diff --git a/src/corpus.py b/src/corpus.py
index c4c5c51..92f0e28 100644
--- a/src/corpus.py
+++ b/src/corpus.py
@@ -3,6 +3,7 @@ import os
 import requests
 import logger as log
 import zlib
+import json
 
 CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
 
@@ -14,39 +15,44 @@ CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
 def fetch():
+    """Download the CORPUS extract and return its list of location records.
+
+    The response body is decompressed as gzip, decoded and parsed as JSON;
+    only the list stored under the 'TIPLOCDATA' key is returned.
+
+    Raises requests.HTTPError when the server answers with an error status.
+    """
     log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
     response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
-    log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
-    return zlib.decompress(response.content, 16+zlib.MAX_WBITS)
+    # Fail loudly on 4xx/5xx (e.g. bad credentials) instead of handing an
+    # error page to zlib and getting an opaque decompression error.
+    response.raise_for_status()
+    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
+    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
+    raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+    return json.loads(raw.decode())['TIPLOCDATA']
 
-def removeEmpty(list):
-    for dict in list:
-        print(dict)
+def removeEmpty(data):
+    """Return a copy of data with every " " (single space) value removed.
+
+    data is a list of dictionaries. A new list of new dictionaries is built,
+    so the input is left untouched.
+    """
+    corpusLength = len(data)
+    log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS datapoints", "INFO")
+    return [
+        {key: value for key, value in item.items() if value != " "}
+        for item in data
+    ]
 
-
-ORIGINAL METHOD FROM FILE:
-#! /usr/bin/python3
-
-import json
-
-print("Opening CORPUSExtract.json")
-with open("./CORPUSExtract.json", "r") as input_file:
-    dict = json.load(input_file)
-    list = dict['TIPLOCDATA']
-
-cleanList = []
-
-print("Processing data")
-for item in list:
-    if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
-        item.pop("UIC")
-        item.pop("NLCDESC16")
-        item.pop("NLC")
-        cleanList.append(item)
-
-print("Saving data")
-
-cleanDict = {"data":cleanList}
-with open("CorpusClean.json", "w") as output_file:
-    output_file.write(json.dumps(cleanDict))
-
-print(cleanList)
-print("Processed.")
\ No newline at end of file
+def onlyStations(data):
+    """Return the items of data that have a 'STANOX' key.
+
+    data is a list of dictionaries. Run removeEmpty() first: it drops blank
+    values, so a key that is still present is not blank.
+    """
+    preLength = len(data)
+    log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
+    workingList = [item for item in data if 'STANOX' in item]
+    postLength = len(workingList)
+    log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
+    log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
+    return workingList
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index 6eb5937..7652b59 100644
--- a/src/main.py
+++ b/src/main.py
@@ -22,9 +22,11 @@ import time
 
 #Local Imports
 import corpus
-import mongo
+#import mongo
 import logger as log
 
-#Fetch Environment Variables
-log.out("main.py: Trying to print CORPUS Data", 'INFO')
-print(corpus.fetchCorpus())
\ No newline at end of file
+#This is only a test run:
+rawCorpus = corpus.fetch()
+cleanCorpus = corpus.removeEmpty(rawCorpus)
+stationsOnly = corpus.onlyStations(cleanCorpus)
+print("DONE")
\ No newline at end of file