CORPUS Work
This commit is contained in:
parent
e910b7a84c
commit
50387bc6b2
@ -2,21 +2,55 @@
|
|||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
import logger as log
|
import logger as log
|
||||||
|
import zlib
|
||||||
|
|
||||||
# Endpoint queried for the CORPUS supporting file.
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

# Credentials come from the environment; each is None when its variable is unset.
log.out("corpus: Fetching CORPUS Configuration", "INFO")
CORPUS_USER = os.environ.get('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.environ.get('OWL_LDB_CORPUSPASS')
|
||||||
|
|
||||||
def fetch(timeout=60):
    """Download the CORPUS extract and return it decompressed.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it the call could block
            indefinitely on an unresponsive server.

    Returns:
        bytes: The decompressed response body.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status
            (for example, rejected credentials).
        requests.RequestException: The request failed or timed out.
        zlib.error: The response body was not valid gzip data.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(
        CORPUS_URL,
        auth=(CORPUS_USER, CORPUS_PASS),
        timeout=timeout,
    )
    # Fail here with a clear HTTP error instead of handing an error page
    # to zlib and getting an unrelated decompression failure.
    response.raise_for_status()

    log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
    # wbits of 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    return zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
|
||||||
|
|
||||||
def removeEmpty(list):
    """Return the records that have both a 3ALPHA and a STANOX value.

    A field counts as empty when it holds a single space (' ') or is
    absent. Records that are kept have their "UIC", "NLCDESC16" and "NLC"
    entries removed in place, so the returned list holds the same dict
    objects as the input.

    Args:
        list: Iterable of record dicts. (The name shadows the builtin
            inside this function; it is kept so keyword callers still work.)

    Returns:
        A new list of the retained records, in input order.
    """
    discarded_keys = ("UIC", "NLCDESC16", "NLC")
    cleaned = []
    for item in list:
        if item.get("3ALPHA", ' ') == ' ' or item.get("STANOX", ' ') == ' ':
            continue
        for key in discarded_keys:
            # Tolerate records that never carried the key.
            item.pop(key, None)
        cleaned.append(item)
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
# ORIGINAL METHOD FROM FILE:
#! /usr/bin/python3
#
# Stand-alone cleaning script kept for reference. It reads
# ./CORPUSExtract.json, keeps the TIPLOCDATA records whose 3ALPHA and STANOX
# fields are not a single space, drops three keys from each kept record, and
# writes the result to CorpusClean.json.
# NOTE(review): these statements run at module level, so they execute on
# import - confirm that is intended.

import json

print("Opening CORPUSExtract.json")
with open("./CORPUSExtract.json", "r") as input_file:
    # Named so the builtins `dict` and `list` stay usable in this module.
    corpus = json.load(input_file)
records = corpus['TIPLOCDATA']

cleanList = []

print("Processing data")
for item in records:
    if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
        for key in ("UIC", "NLCDESC16", "NLC"):
            # Tolerate records that never carried the key.
            item.pop(key, None)
        cleanList.append(item)

print("Saving data")

cleanDict = {"data": cleanList}
with open("CorpusClean.json", "w") as output_file:
    output_file.write(json.dumps(cleanDict))

print(cleanList)
print("Processed.")
|
Reference in New Issue
Block a user