CORPUS Work

2023-02-11 16:48:42 +00:00
parent e910b7a84c
commit 50387bc6b2
1 changed files with 44 additions and 10 deletions
--- a/src/corpus.py
+++ b/src/corpus.py
@@ -2,21 +2,55 @@
 import os
 import requests
 import logger as log
+import zlib

 CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

 #Fetch Configuration
-log.out("corpus.py: Fetching CORPUS Configuration", "INFO")
+log.out("corpus: Fetching CORPUS Configuration", "INFO")
 CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
 CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')

-def hello():
-    print("hello")
-    return
+def fetch():
+    """Download the CORPUS extract from Network Rail and return it decompressed.
+
+    Authenticates with CORPUS_USER / CORPUS_PASS (HTTP basic auth) against
+    CORPUS_URL and gunzips the response body.
+
+    Returns:
+        bytes: the decompressed payload.
+
+    Raises:
+        requests.HTTPError: if the server answers with a 4xx/5xx status.
+        requests.Timeout: if the server stops responding.
+        zlib.error: if the body is not valid gzip data.
+    """
+    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
+    # requests never times out by default; bound the wait so a stalled
+    # connection cannot hang the caller indefinitely.
+    response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS), timeout=30)
+    # Fail here on bad credentials / server errors rather than handing an
+    # error page to zlib and getting a misleading decompression error.
+    response.raise_for_status()
+    log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
+    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
+    return zlib.decompress(response.content, 16 + zlib.MAX_WBITS)

-def fetchCorpus():
-    r = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
-    ## Need to ungzip the response - should be able to do with no additional imports
-      ## See: https://stackoverflow.com/questions/40756106/decompress-python-requests-response-with-zlib
-    ## Need to return the result
-    return
+def removeEmpty(list):
+    """Return the entries that have both a 3ALPHA and a STANOX value.
+
+    An entry is kept only when neither its "3ALPHA" nor its "STANOX" field
+    is the single-space placeholder ' '. Kept entries have their "UIC",
+    "NLCDESC16" and "NLC" fields removed.
+
+    Args:
+        list: iterable of dicts, each containing "3ALPHA" and "STANOX" keys.
+            (The name shadows the builtin but is kept so existing keyword
+            callers continue to work.)
+
+    Returns:
+        A new list holding the kept entries, in their original order. The
+        kept dicts are the same objects that were passed in and are
+        modified in place.
+
+    Raises:
+        KeyError: if an entry has no "3ALPHA" or "STANOX" key.
+    """
+    # Collect into a local list and return it; the function must not depend
+    # on a module-level accumulator being defined elsewhere.
+    cleanList = []
+    for item in list:
+        if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
+            # Default of None: an entry lacking one of these fields is
+            # already in the desired state and must not raise.
+            for key in ("UIC", "NLCDESC16", "NLC"):
+                item.pop(key, None)
+            cleanList.append(item)
+    return cleanList
+
+
+# ORIGINAL METHOD FROM FILE:
+#! /usr/bin/python3
+
+import json
+
+# Standalone clean-up script: reads ./CORPUSExtract.json, keeps the entries
+# that have both a 3ALPHA and a STANOX value, drops the UIC / NLCDESC16 / NLC
+# fields from them and writes the result to CorpusClean.json.
+
+print("Opening CORPUSExtract.json")
+with open("./CORPUSExtract.json", "r") as input_file:
+    # Named corpusData / tiplocList rather than dict / list so the builtins
+    # stay usable in the rest of the module.
+    corpusData = json.load(input_file)
+    tiplocList = corpusData['TIPLOCDATA']
+
+cleanList = []
+
+print("Processing data")
+for item in tiplocList:
+    # A single space is the placeholder the extract uses for "no value".
+    if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
+        item.pop("UIC")
+        item.pop("NLCDESC16")
+        item.pop("NLC")
+        cleanList.append(item)
+
+print("Saving data")
+
+# The cleaned entries are wrapped under a single "data" key in the output.
+cleanDict = {"data":cleanList}
+with open("CorpusClean.json", "w") as output_file:
+    output_file.write(json.dumps(cleanDict))
+
+print(cleanList)
+print("Processed.")