CORPUS Work

2023-02-11 16:48:42 +00:00
parent e910b7a84c
commit 50387bc6b2
1 changed file with 44 additions and 10 deletions
--- a/src/corpus.py
+++ b/src/corpus.py
@@ -2,21 +2,55 @@
 import os
 import requests
 import logger as log
 import zlib
 CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
 #Fetch Configuration
-log.out("corpus.py: Fetching CORPUS Configuration", "INFO")
+log.out("corpus: Fetching CORPUS Configuration", "INFO")
 CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
 CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
-def hello():
+def fetch():
-    print("hello")
+    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
-    return
+    response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
    log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
    return zlib.decompress(response.content, 16+zlib.MAX_WBITS)
-def fetchCorpus():
+def removeEmpty(list):
-    r = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
+    for item in list:
-    ## Need to ungzip the response - should be able to do with no additional imports
+        if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
-      ## See: https://stackoverflow.com/questions/40756106/decompress-python-requests-response-with-zlib
+            item.pop("UIC")
-    ## Need to return the result
+            item.pop("NLCDESC16")
-    return
+            item.pop("NLC")
            cleanList.append(item)
 ORIGINAL METHOD FROM FILE:
 #! /usr/bin/python3
 import json
 print("Opening CORPUSExtract.json")
 with open("./CORPUSExtract.json", "r") as input_file:
    dict = json.load(input_file)
    list = dict['TIPLOCDATA']
 cleanList = []
 print("Processing data")
 for item in list:
    if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
        item.pop("UIC")
        item.pop("NLCDESC16")
        item.pop("NLC")
        cleanList.append(item)
 print("Saving data")
 cleanDict = {"data":cleanList}
 with open("CorpusClean.json", "w") as output_file:
    output_file.write(json.dumps(cleanDict))
 print(cleanList)
 print("Processed.")