Finalise? corpus.py
This commit is contained in:
parent
5b3965d50e
commit
72445785a5
@ -3,6 +3,7 @@ import os
|
||||
import requests
|
||||
import logger as log
|
||||
import zlib
|
||||
import json
|
||||
|
||||
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
|
||||
|
||||
@ -14,39 +15,39 @@ CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
|
||||
def fetch():
    """Download the CORPUS extract from Network Rail and return its records.

    Requests ``CORPUS_URL`` using HTTP basic auth (``CORPUS_USER`` /
    ``CORPUS_PASS``), gunzips the response body, parses it as JSON and
    returns the value stored under the ``'TIPLOCDATA'`` key.

    Returns:
        The ``'TIPLOCDATA'`` entry of the parsed document. The other
        functions in this module iterate it as a list of dictionaries.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        zlib.error: The response body is not valid gzip data.
        json.JSONDecodeError: The decompressed body is not valid JSON.
        KeyError: The parsed document has no ``'TIPLOCDATA'`` key.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
    # Surface authentication/server failures as an HTTP error instead of
    # letting an error page reach zlib and fail with a misleading message.
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    parsed = json.loads(raw.decode())
    return parsed['TIPLOCDATA']
|
||||
|
||||
def removeEmpty(data):
    """Return the CORPUS records with blank fields dropped.

    ``data`` is a list of dictionaries. For each one a new dictionary is
    built that keeps only the entries whose value is not the single-space
    string ``" "``; the new dictionaries are returned in the same order.
    Neither ``data`` nor the dictionaries inside it are modified.

    Args:
        data: List of dictionaries (the CORPUS datapoints).

    Returns:
        A new list of new dictionaries without any ``" "`` values.
    """
    corpusLength = len(data)
    log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS datapoints", "INFO")
    return [
        {key: value for key, value in item.items() if value != " "}
        for item in data
    ]
|
||||
|
||||
|
||||
ORIGINAL METHOD FROM FILE:
|
||||
#! /usr/bin/python3
# Stand-alone clean-up script: reads ./CORPUSExtract.json, keeps the
# TIPLOCDATA records that have both a 3ALPHA and a STANOX value, strips the
# UIC / NLCDESC16 / NLC fields from them and writes the result to
# CorpusClean.json under the "data" key.

import json

print("Opening CORPUSExtract.json")
with open("./CORPUSExtract.json", "r") as input_file:
    corpusExtract = json.load(input_file)
records = corpusExtract['TIPLOCDATA']

cleanList = []

print("Processing data")
for item in records:
    # A single space is how the extract marks a blank field.
    if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
        # Default of None: a record lacking one of these optional fields
        # is simply left as-is instead of aborting the run with KeyError.
        for unwanted in ("UIC", "NLCDESC16", "NLC"):
            item.pop(unwanted, None)
        cleanList.append(item)

print("Saving data")

cleanDict = {"data": cleanList}
with open("CorpusClean.json", "w") as output_file:
    json.dump(cleanDict, output_file)

print(cleanList)
print("Processed.")
|
||||
def onlyStations(data):
    """Filter a list of dictionaries down to those containing a STANOX key.

    Logs how many datapoints were received and how many were dropped, then
    returns a new list holding the dictionaries that have a ``'STANOX'``
    key, in their original order. ``data`` itself is left untouched.
    """
    total = len(data)
    log.out(f"corpus.onlyStations: Removing items without STANOX from {total} datapoints", "INFO")
    withStanox = [entry for entry in data if 'STANOX' in entry]
    kept = len(withStanox)
    log.out(f"corpus.onlyStations: Removed {total - kept} datapoints", "INFO")
    log.out(f"Yes, I am aware there are not {kept} stations but the data includes NI, TfL, some bus, tram and closed stations too", "OTHR")
    return withStanox
|
10
src/main.py
10
src/main.py
@ -22,9 +22,11 @@ import time
|
||||
|
||||
#Local Imports
|
||||
import corpus
|
||||
import mongo
|
||||
#import mongo
|
||||
import logger as log
|
||||
|
||||
#Fetch Environment Variables
|
||||
# This is only a test run: download the CORPUS records, strip the blank
# fields, then keep the entries that have a STANOX.
# (The earlier `corpus.fetchCorpus()` call was dropped: corpus.py exposes
# `fetch`, so that name would raise AttributeError before this ran.)
rawCorpus = corpus.fetch()
cleanCorpus = corpus.removeEmpty(rawCorpus)
stationsOnly = corpus.onlyStations(cleanCorpus)
print("DONE")
|
Reference in New Issue
Block a user