# Python source, 62 lines, 2.5 KiB
#Imports
import json
import os
import zlib

import requests

import logger as log
import mongo

# Network Rail endpoint for the CORPUS supporting file; fetch() below
# authenticates against it with the credentials configured here.
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"

#Fetch Configuration
# Credentials come from the environment; os.getenv returns None when a
# variable is not set.
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')

log.out("corpus.py: CORPUS Module Loaded", "DBUG")

def fetch(timeout=30):
    """Download the CORPUS file from Network Rail and return its TIPLOC records.

    Args:
        timeout: Seconds handed to ``requests.get`` as its timeout so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The value stored under the ``TIPLOCDATA`` key of the decoded JSON
        payload.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    log.out("corpus.fetch: Fetching CORPUS Data from Network Rail", "INFO")
    response = requests.get(CORPUS_URL, auth=(CORPUS_USER, CORPUS_PASS), timeout=timeout)
    # Count the API call once a response has been received, even if it turns
    # out to be an error response.
    mongo.incrementCounter("corpus_api")
    # Surface HTTP errors (e.g. bad credentials) as themselves instead of as
    # an opaque zlib failure when the error page is fed to the decompressor.
    response.raise_for_status()
    log.out("corpus.fetch: Decompressing & parsing CORPUS data", "INFO")
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    parsed = json.loads(zlib.decompress(response.content, 16 + zlib.MAX_WBITS).decode())
    return parsed['TIPLOCDATA']

def removeEmpty(data):
    """Return a copy of ``data`` with every single-space (" ") value dropped.

    Args:
        data: List of dictionaries (the CORPUS documents).

    Returns:
        A new list containing one new dictionary per input item, holding only
        the key/value pairs whose value is not exactly " ". The input list and
        its dictionaries are left unmodified.
    """
    log.out(f"corpus.removeEmpty: Removing empty strings from {len(data)} CORPUS documents", "INFO")
    return [
        {key: value for key, value in item.items() if value != " "}
        for item in data
    ]

def onlyStations(data):
    """Keep only the documents that carry a ``3ALPHA`` key.

    Each retained document is copied into a new dictionary with the ``NLC``,
    ``NLCDESC16`` and ``UIC`` keys left out; documents without ``3ALPHA`` are
    discarded. The input list and its dictionaries are not modified.

    Args:
        data: List of dictionaries (the CORPUS documents).

    Returns:
        A new list with the filtered, trimmed copies in their original order.
    """
    droppedKeys = ("NLC", "NLCDESC16", "UIC")
    preLength = len(data)
    log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {preLength} documents", "INFO")
    workingList = [
        {key: value for key, value in item.items() if key not in droppedKeys}
        for item in data
        if '3ALPHA' in item
    ]
    postLength = len(workingList)
    log.out(f"corpus.onlyStations: Removed {preLength - postLength} documents", "INFO")
    log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
    return workingList