Finalise? corpus.py

This commit is contained in:
Fred Boniface 2023-02-11 20:10:11 +00:00
parent 5b3965d50e
commit 72445785a5
2 changed files with 41 additions and 38 deletions

View File

@@ -3,6 +3,7 @@ import os
import requests import requests
import logger as log import logger as log
import zlib import zlib
import json
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS" CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
@@ -14,39 +15,39 @@ CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
def fetch(): def fetch():
log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO") log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS)) response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
log.out("corpus.fetch: Decompressing CORPUS data", "INFO") log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
return zlib.decompress(response.content, 16+zlib.MAX_WBITS) parsed = json.loads(zlib.decompress(response.content, 16+zlib.MAX_WBITS).decode())
return parsed['TIPLOCDATA']
def removeEmpty(list): def removeEmpty(data):
for dict in list: # DATA: List of Dictionaries
print(dict) # Iterates through the list,
# Iterates through each iterated list item (will be dictionary)
# Adds items that are not equal to " " to a new 'workingDictionary'
# Appends the working dictionary to the 'workingList' before moving on to the next item in the list
# Finally returns the 'workingList' which is the CORPUS data without any " " values.
corpusLength = len(data)
log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS datapoints", "INFO")
workingList = []
for item in data:
workingDict = {}
for key in item:
if item[key] != " ":
workingDict.update({key: item[key]})
workingList.append(workingDict)
return workingList
def onlyStations(data):
ORIGINAL METHOD FROM FILE: # DATA: List of Dictionaries
#! /usr/bin/python3 # Iterates through the list and checks each item (which is a dictionary) for the STANOX key
# If the STANOX exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
import json preLength = len(data)
log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
print("Opening CORPUSExtract.json") workingList = []
with open("./CORPUSExtract.json", "r") as input_file: for item in data:
dict = json.load(input_file) if 'STANOX' in item:
list = dict['TIPLOCDATA'] workingList.append(item)
postLength = len(workingList)
cleanList = [] log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
print("Processing data") return workingList
for item in list:
if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
item.pop("UIC")
item.pop("NLCDESC16")
item.pop("NLC")
cleanList.append(item)
print("Saving data")
cleanDict = {"data":cleanList}
with open("CorpusClean.json", "w") as output_file:
output_file.write(json.dumps(cleanDict))
print(cleanList)
print("Processed.")

View File

@@ -22,9 +22,11 @@ import time
#Local Imports #Local Imports
import corpus import corpus
import mongo #import mongo
import logger as log import logger as log
#Fetch Environment Variables #This is only a test run:
log.out("main.py: Trying to print CORPUS Data", 'INFO') rawCorpus = corpus.fetch()
print(corpus.fetchCorpus()) cleanCorpus = corpus.removeEmpty(rawCorpus)
stationsOnly = corpus.onlyStations(cleanCorpus)
print("DONE")