Finalise? corpus.py

2023-02-11 20:10:11 +00:00 · 2023-02-11 20:10:11 +00:00 · 72445785a5
commit 72445785a5
parent 5b3965d50e
2 changed files with 41 additions and 38 deletions
--- a/src/corpus.py
+++ b/src/corpus.py
@@ -3,6 +3,7 @@ import os
 import requests
 import logger as log
 import zlib
 import json
 CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
@@ -14,39 +15,39 @@ CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
 def fetch():
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
-    log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
+    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
-    return zlib.decompress(response.content, 16+zlib.MAX_WBITS)
+    parsed = json.loads(zlib.decompress(response.content, 16+zlib.MAX_WBITS).decode())
    return parsed['TIPLOCDATA']
-def removeEmpty(list):
+def removeEmpty(data):
-    for dict in list:
+    # DATA: List of Dictionaries
-      print(dict)
+    # Iterates through the list,
    # then iterates through the keys of each list item (each item is a dictionary).
    # Adds key/value pairs whose value is not equal to " " to a new 'workingDict'.
    # Appends the working dictionary to the 'workingList' before moving on to the next item in the list
    # Finally returns the 'workingList' which is the CORPUS data without any " " values.
    corpusLength = len(data)
    log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS datapoints", "INFO")
    workingList = []
    for item in data:
        workingDict = {}
        for key in item:
            if item[key] != " ":
              workingDict.update({key: item[key]})
        workingList.append(workingDict)
    return workingList
-
+def onlyStations(data):
-ORIGINAL METHOD FROM FILE:
+    # DATA: List of Dictionaries
-#! /usr/bin/python3
+    # Iterates through the list and checks each item (which is a dictionary) for the STANOX key
-
+    # If the STANOX key exists, the dictionary is appended to the `workingList`, which is returned once iteration is complete
-import json
+    preLength = len(data)
-
+    log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
-print("Opening CORPUSExtract.json")
+    workingList = []
-with open("./CORPUSExtract.json", "r") as input_file:
+    for item in data:
-    dict = json.load(input_file)
+        if 'STANOX' in item:
-    list = dict['TIPLOCDATA']
+            workingList.append(item)
-
+    postLength = len(workingList)
-cleanList = []
+    log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
-
+    log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
-print("Processing data")
+    return workingList
 for item in list:
    if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
        item.pop("UIC")
        item.pop("NLCDESC16")
        item.pop("NLC")
        cleanList.append(item)
 print("Saving data")
 cleanDict = {"data":cleanList}
 with open("CorpusClean.json", "w") as output_file:
    output_file.write(json.dumps(cleanDict))
 print(cleanList)
 print("Processed.")
--- a/src/main.py
+++ b/src/main.py
@@ -22,9 +22,11 @@ import time
 #Local Imports
 import corpus
-import mongo
+#import mongo
 import logger as log
-#Fetch Environment Variables
+#This is only a test run:
-log.out("main.py: Trying to print CORPUS Data", 'INFO')
+rawCorpus = corpus.fetch()
-print(corpus.fetchCorpus())
+cleanCorpus = corpus.removeEmpty(rawCorpus)
 stationsOnly = corpus.onlyStations(cleanCorpus)
 print("DONE")