import os
import requests
import logger as log
import zlib

CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

# Keys stripped from every record that removeEmpty() keeps.
_DROPPED_FIELDS = ("UIC", "NLCDESC16", "NLC")

#Fetch Configuration
log.out("corpus: Fetching CORPUS Configuration", "INFO")
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')


def fetch(timeout=30):
    """Download the CORPUS extract and return it decompressed.

    Sends an authenticated GET to CORPUS_URL using the credentials read from
    the OWL_LDB_CORPUSUSER / OWL_LDB_CORPUSPASS environment variables, then
    gunzips the response body.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, requests waits forever on an unresponsive server.

    Returns:
        The decompressed response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (e.g. bad credentials). Checked before decompressing so an error
            page is not fed to zlib.
        requests.RequestException: On connection problems or timeout.
        zlib.error: If the body is not valid gzip data.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(
        CORPUS_URL,
        auth=(CORPUS_USER, CORPUS_PASS),
        timeout=timeout,
    )
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
    # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    return zlib.decompress(response.content, 16 + zlib.MAX_WBITS)


def removeEmpty(list):
    """Return the records that have both a 3ALPHA and a STANOX value.

    A record is kept when neither its "3ALPHA" nor its "STANOX" value is the
    single-space string ' ' (presumably how the feed marks a blank field;
    verify against real CORPUS data). Kept records are returned as new dicts
    without the "UIC", "NLCDESC16" and "NLC" keys; the input dicts are left
    untouched.

    Args:
        list: Iterable of record dicts, each containing "3ALPHA" and
            "STANOX" keys. (The name shadows the builtin; it is kept so
            existing keyword callers keep working.)

    Returns:
        A new list of filtered record dicts, in input order.

    Raises:
        KeyError: If a record has no "3ALPHA" or "STANOX" key.
    """
    cleanList = []
    for item in list:
        if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
            cleanList.append(
                {key: value for key, value in item.items()
                 if key not in _DROPPED_FIELDS}
            )
    return cleanList


# ORIGINAL METHOD FROM FILE:
# Reference copy of the standalone script that removeEmpty() was ported from.
# Kept as a comment so the module still imports.
# TODO: the 'TIPLOCDATA' extraction and the CorpusClean.json output below
# have not been ported into this module yet.
#
# #! /usr/bin/python3
#
# import json
#
# print("Opening CORPUSExtract.json")
# with open("./CORPUSExtract.json", "r") as input_file:
#     dict = json.load(input_file)
#     list = dict['TIPLOCDATA']
#
# cleanList = []
#
# print("Processing data")
# for item in list:
#     if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
#         item.pop("UIC")
#         item.pop("NLCDESC16")
#         item.pop("NLC")
#         cleanList.append(item)
#
# print("Saving data")
#
# cleanDict = {"data":cleanList}
# with open("CorpusClean.json", "w") as output_file:
#     output_file.write(json.dumps(cleanDict))
#
# print(cleanList)
# print("Processed.")