db-manager/src/corpus.py

52 lines
1.3 KiB
Python
Raw Normal View History

2023-02-11 15:16:25 +00:00
#Imports
import os
import requests
import logger as log
2023-02-11 16:48:42 +00:00
import zlib
2023-02-11 15:16:25 +00:00
# Network Rail data-feeds endpoint requested by fetch() to download the CORPUS file.
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
#Fetch Configuration
2023-02-11 16:48:42 +00:00
# Runs at import time: records that the CORPUS configuration is being loaded.
log.out("corpus: Fetching CORPUS Configuration", "INFO")
2023-02-11 15:16:25 +00:00
# Credentials used as HTTP auth for the CORPUS download. They are read from the
# environment once at import time; os.getenv returns None for an unset variable.
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
2023-02-11 16:48:42 +00:00
def fetch(timeout=60):
    """Download the CORPUS file from Network Rail and return it decompressed.

    Args:
        timeout: Value passed to ``requests.get`` as its ``timeout`` argument
            (seconds, or a ``(connect, read)`` tuple). Without it a stalled
            connection would block the caller indefinitely.

    Returns:
        bytes: The decompressed response body.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
        zlib.error: If the response body is not valid gzip data.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(
        CORPUS_URL,
        auth=(CORPUS_USER, CORPUS_PASS),
        timeout=timeout,
    )
    # Surface HTTP failures (e.g. rejected credentials) as an HTTP error here
    # instead of letting an error page reach zlib and fail with a confusing
    # decompression error.
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing CORPUS data", "INFO")
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    return zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
2023-02-11 15:16:25 +00:00
2023-02-11 16:48:42 +00:00
def removeEmpty(list):
    """Print every entry of *list*, one per line.

    NOTE(review): despite its name this function does not filter anything
    yet; it only prints each entry and returns None. It looks like a stub
    for the cleaning logic - confirm before relying on it.

    Args:
        list: Iterable of entries to print. The parameter name shadows the
            builtin ``list``; it is kept so existing keyword callers still
            work.
    """
    # The loop variable is named `entry` so the builtin `dict` is not shadowed.
    for entry in list:
        print(entry)
2023-02-11 16:48:42 +00:00
ORIGINAL METHOD FROM FILE:
#! /usr/bin/python3
import json

# Fields removed from every record that is kept.
_DROPPED_FIELDS = ("UIC", "NLCDESC16", "NLC")


def main():
    """Clean ./CORPUSExtract.json and write the result to CorpusClean.json.

    Reads the ``TIPLOCDATA`` list from the input file, keeps only the records
    whose ``3ALPHA`` and ``STANOX`` values are both different from a single
    space, removes the ``UIC``, ``NLCDESC16`` and ``NLC`` fields from those
    records, and writes ``{"data": [...]}`` to the output file. Progress
    messages and the cleaned list are printed to stdout.

    Raises:
        FileNotFoundError: If ./CORPUSExtract.json does not exist.
        KeyError: If the input has no ``TIPLOCDATA`` key, or a record has no
            ``3ALPHA`` or ``STANOX`` key.
    """
    print("Opening CORPUSExtract.json")
    with open("./CORPUSExtract.json", "r", encoding="utf-8") as input_file:
        corpus = json.load(input_file)
    entries = corpus['TIPLOCDATA']
    cleanList = []
    print("Processing data")
    for item in entries:
        # NOTE(review): the source had lost its indentation; the field
        # removal is assumed to apply only to records that pass this filter.
        if item["3ALPHA"] != ' ' and item["STANOX"] != ' ':
            for field in _DROPPED_FIELDS:
                # A default avoids a KeyError on records missing the field.
                item.pop(field, None)
            cleanList.append(item)
    print("Saving data")
    cleanDict = {"data": cleanList}
    with open("CorpusClean.json", "w", encoding="utf-8") as output_file:
        json.dump(cleanDict, output_file)
    print(cleanList)
    print("Processed.")


if __name__ == "__main__":
    main()