Finalise? corpus.py
This commit is contained in:
parent
5b3965d50e
commit
72445785a5
@ -3,6 +3,7 @@ import os
|
|||||||
import requests
|
import requests
|
||||||
import logger as log
|
import logger as log
|
||||||
import zlib
|
import zlib
|
||||||
|
import json
|
||||||
|
|
||||||
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
|
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
|
||||||
|
|
||||||
@ -14,39 +15,39 @@ CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
|
|||||||
def fetch():
|
def fetch():
|
||||||
log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
|
log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
|
||||||
response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
|
response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS))
|
||||||
log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
|
log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
|
||||||
return zlib.decompress(response.content, 16+zlib.MAX_WBITS)
|
parsed = json.loads(zlib.decompress(response.content, 16+zlib.MAX_WBITS).decode())
|
||||||
|
return parsed['TIPLOCDATA']
|
||||||
|
|
||||||
def removeEmpty(list):
|
def removeEmpty(data):
|
||||||
for dict in list:
|
# DATA: List of Dictionaries
|
||||||
print(dict)
|
# Iterates through the list,
|
||||||
|
# Iterates through the keys of each list item (each item is a dictionary)
|
||||||
|
# Adds key/value pairs whose value is not equal to " " to a new 'workingDict'
|
||||||
|
# Appends the working dictionary to the 'workingList' before moving on to the next item in the list
|
||||||
|
# Finally returns the 'workingList' which is the CORPUS data without any " " values.
|
||||||
|
corpusLength = len(data)
|
||||||
|
log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS datapoints", "INFO")
|
||||||
|
workingList = []
|
||||||
|
for item in data:
|
||||||
|
workingDict = {}
|
||||||
|
for key in item:
|
||||||
|
if item[key] != " ":
|
||||||
|
workingDict.update({key: item[key]})
|
||||||
|
workingList.append(workingDict)
|
||||||
|
return workingList
|
||||||
|
|
||||||
|
def onlyStations(data):
|
||||||
ORIGINAL METHOD FROM FILE:
|
# DATA: List of Dictionaries
|
||||||
#! /usr/bin/python3
|
# Iterates through the list and checks each item (which is a dictionary) for the STANOX key
|
||||||
|
# If the STANOX key exists, the dictionary is appended to the `workingList`, which is returned once iteration is complete
|
||||||
import json
|
preLength = len(data)
|
||||||
|
log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
|
||||||
print("Opening CORPUSExtract.json")
|
workingList = []
|
||||||
with open("./CORPUSExtract.json", "r") as input_file:
|
for item in data:
|
||||||
dict = json.load(input_file)
|
if 'STANOX' in item:
|
||||||
list = dict['TIPLOCDATA']
|
workingList.append(item)
|
||||||
|
postLength = len(workingList)
|
||||||
cleanList = []
|
log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
|
||||||
|
log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
|
||||||
print("Processing data")
|
return workingList
|
||||||
for item in list:
|
|
||||||
if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
|
|
||||||
item.pop("UIC")
|
|
||||||
item.pop("NLCDESC16")
|
|
||||||
item.pop("NLC")
|
|
||||||
cleanList.append(item)
|
|
||||||
|
|
||||||
print("Saving data")
|
|
||||||
|
|
||||||
cleanDict = {"data":cleanList}
|
|
||||||
with open("CorpusClean.json", "w") as output_file:
|
|
||||||
output_file.write(json.dumps(cleanDict))
|
|
||||||
|
|
||||||
print(cleanList)
|
|
||||||
print("Processed.")
|
|
10
src/main.py
10
src/main.py
@ -22,9 +22,11 @@ import time
|
|||||||
|
|
||||||
#Local Imports
|
#Local Imports
|
||||||
import corpus
|
import corpus
|
||||||
import mongo
|
#import mongo
|
||||||
import logger as log
|
import logger as log
|
||||||
|
|
||||||
#Fetch Environment Variables
|
#This is only a test run:
|
||||||
log.out("main.py: Trying to print CORPUS Data", 'INFO')
|
rawCorpus = corpus.fetch()
|
||||||
print(corpus.fetchCorpus())
|
cleanCorpus = corpus.removeEmpty(rawCorpus)
|
||||||
|
stationsOnly = corpus.onlyStations(cleanCorpus)
|
||||||
|
print("DONE")
|
Reference in New Issue
Block a user