Add indexes
This commit is contained in:
parent
4c10cb6667
commit
e2a00aa5e3
@ -8,7 +8,7 @@ import json
|
|||||||
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
|
CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS"
|
||||||
|
|
||||||
#Fetch Configuration
|
#Fetch Configuration
|
||||||
log.out("corpus: Fetching CORPUS Configuration", "INFO")
|
log.out("corpus.py: Fetching CORPUS Configuration", "INFO")
|
||||||
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
|
CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER')
|
||||||
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
|
CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS')
|
||||||
|
|
||||||
@ -27,7 +27,7 @@ def removeEmpty(data):
|
|||||||
# Appends the working dictionary to the 'workingList' before moving on to the next item in the list
|
# Appends the working dictionary to the 'workingList' before moving on to the next item in the list
|
||||||
# Finally returns the 'workingList' which is the CORPUS data without any " " values.
|
# Finally returns the 'workingList' which is the CORPUS data without any " " values.
|
||||||
corpusLength = len(data)
|
corpusLength = len(data)
|
||||||
log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS datapoints", "INFO")
|
log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS documents", "INFO")
|
||||||
workingList = []
|
workingList = []
|
||||||
for item in data:
|
for item in data:
|
||||||
workingDict = {}
|
workingDict = {}
|
||||||
@ -42,12 +42,12 @@ def onlyStations(data):
|
|||||||
# Iterates through the list and checks each item (which is a dictionary) for the 3ALPHA key
|
# Iterates through the list and checks each item (which is a dictionary) for the 3ALPHA key
|
||||||
# If the 3ALPHA exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
|
# If the 3ALPHA exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
|
||||||
preLength = len(data)
|
preLength = len(data)
|
||||||
log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
|
log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {preLength} documents", "INFO")
|
||||||
workingList = []
|
workingList = []
|
||||||
for item in data:
|
for item in data:
|
||||||
if '3ALPHA' in item:
|
if '3ALPHA' in item:
|
||||||
workingList.append(item)
|
workingList.append(item)
|
||||||
postLength = len(workingList)
|
postLength = len(workingList)
|
||||||
log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
|
log.out(f"corpus.onlyStations: Removed {preLength - postLength} documents", "INFO")
|
||||||
log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
|
log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR")
|
||||||
return workingList
|
return workingList
|
38
src/mongo.py
38
src/mongo.py
@ -11,7 +11,7 @@ db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER', "owl"))
|
|||||||
db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS', "twittwoo"))
|
db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS', "twittwoo"))
|
||||||
db_name = os.getenv('OWL_DB_NAME', "owlboard")
|
db_name = os.getenv('OWL_DB_NAME', "owlboard")
|
||||||
|
|
||||||
log.out(f"mongo.py: Connecting to database at {db_host}:{db_port}", "INFOm")
|
log.out(f"mongo.py: Connecting to database at {db_host}:{db_port}", "INFO")
|
||||||
client = MongoClient(f"mongodb://{db_user}:{db_pass}@{db_host}:{db_port}")
|
client = MongoClient(f"mongodb://{db_user}:{db_pass}@{db_host}:{db_port}")
|
||||||
db = client[db_name]
|
db = client[db_name]
|
||||||
|
|
||||||
@ -30,34 +30,46 @@ def getLength(collection):
|
|||||||
col = db[collection]
|
col = db[collection]
|
||||||
return col.count_documents({})
|
return col.count_documents({})
|
||||||
|
|
||||||
|
def createSingleIndex(collection, field):
|
||||||
|
col = db[collection]
|
||||||
|
col.create_index(field)
|
||||||
|
log.out(f'mongo.createSingleIndex: Created index of {field} in {collection}', "INFO")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
def putBulkCorpus(data):
|
def putBulkCorpus(data):
|
||||||
startCount = getLength("corpus")
|
collection = "corpus"
|
||||||
col = db["corpus"]
|
startCount = getLength(collection)
|
||||||
|
col = db[collection]
|
||||||
if startCount > 0:
|
if startCount > 0:
|
||||||
log.out(f'mongo.putBulkCorpus: Dropping {startCount} CORPUS documents', "INFO")
|
log.out(f'mongo.putBulkCorpus: Dropping {startCount} CORPUS documents', "INFO")
|
||||||
col.drop()
|
col.drop()
|
||||||
col.insert_many(data)
|
col.insert_many(data)
|
||||||
endCount = getLength("corpus")
|
endCount = getLength(collection)
|
||||||
log.out(f'mongo.putBulkCorpus: {endCount} documents inserted', "INFO")
|
log.out(f'mongo.putBulkCorpus: {endCount} documents inserted', "INFO")
|
||||||
log.out(f'mongo.putBulkCorpus: {endCount - startCount} new documents', "INFO")
|
log.out(f'mongo.putBulkCorpus: {endCount - startCount} new documents', "INFO")
|
||||||
#log.out('mongo.putBulkCorpus: Building collection indexes',"INFO")
|
log.out('mongo.putBulkCorpus: Building collection indexes',"INFO")
|
||||||
# ADD INDEXES HERE
|
createSingleIndex(collection, "NLC")
|
||||||
|
createSingleIndex(collection, "3ALPHA")
|
||||||
log.out('mongo.putBulkCorpus: Updating meta time',"INFO")
|
log.out('mongo.putBulkCorpus: Updating meta time',"INFO")
|
||||||
metaUpdateTime("corpus")
|
metaUpdateTime(collection)
|
||||||
return
|
return
|
||||||
|
|
||||||
def putBulkStations(data):
|
def putBulkStations(data):
|
||||||
startCount = getLength("stations")
|
collection = "stations"
|
||||||
col = db["stations"]
|
startCount = getLength(collection)
|
||||||
|
col = db[collection]
|
||||||
if startCount > 0:
|
if startCount > 0:
|
||||||
log.out(f'mongo.putBulkStations: Dropping {startCount} station documents', "INFO")
|
log.out(f'mongo.putBulkStations: Dropping {startCount} station documents', "INFO")
|
||||||
col.drop()
|
col.drop()
|
||||||
col.insert_many(data)
|
col.insert_many(data)
|
||||||
endCount = getLength("stations")
|
endCount = getLength(collection)
|
||||||
log.out(f'mongo.putBulkStations: {endCount} documents inserted', "INFO")
|
log.out(f'mongo.putBulkStations: {endCount} documents inserted', "INFO")
|
||||||
log.out(f'mongo.putBulkStations: {endCount - startCount} new documents', "INFO")
|
log.out(f'mongo.putBulkStations: {endCount - startCount} new documents', "INFO")
|
||||||
#log.out('mongo.putBulkStations: Building collection indexes',"INFO")
|
log.out('mongo.putBulkStations: Building collection indexes',"INFO")
|
||||||
# ADD INDEXES HERE
|
createSingleIndex(collection, "3ALPHA")
|
||||||
|
createSingleIndex(collection, "STANOX")
|
||||||
|
createSingleIndex(collection, "TIPLOC")
|
||||||
log.out('mongo.putBulkStations: Updating meta time',"INFO")
|
log.out('mongo.putBulkStations: Updating meta time',"INFO")
|
||||||
metaUpdateTime("stations")
|
metaUpdateTime(collection)
|
||||||
return
|
return
|
Reference in New Issue
Block a user