From e2a00aa5e393b90f99cc8215f39c6b7a5cc15cc5 Mon Sep 17 00:00:00 2001 From: Fred Boniface Date: Sun, 12 Feb 2023 20:53:59 +0000 Subject: [PATCH] Add indexes --- src/corpus.py | 8 ++++---- src/mongo.py | 38 +++++++++++++++++++++++++------------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/corpus.py b/src/corpus.py index 238caa1..6e564b4 100644 --- a/src/corpus.py +++ b/src/corpus.py @@ -8,7 +8,7 @@ import json CORPUS_URL = "https://publicdatafeeds.networkrail.co.uk/ntrod/SupportingFileAuthenticate?type=CORPUS" #Fetch Configuration -log.out("corpus: Fetching CORPUS Configuration", "INFO") +log.out("corpus.py: Fetching CORPUS Configuration", "INFO") CORPUS_USER = os.getenv('OWL_LDB_CORPUSUSER') CORPUS_PASS = os.getenv('OWL_LDB_CORPUSPASS') @@ -27,7 +27,7 @@ def removeEmpty(data): # Appends the working dictionary to the 'workingList' before moving on to the next item in the list # Finally returns the 'workingList' which is the CORPUS data without any " " values. corpusLength = len(data) - log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS datapoints", "INFO") + log.out(f"corpus.removeEmpty: Removing empty strings from {corpusLength} CORPUS documents", "INFO") workingList = [] for item in data: workingDict = {} @@ -42,12 +42,12 @@ def onlyStations(data): # Iterates through the list and checks each item (which is a dictionary) for the STANOX key # If the 3ALPHA exists the dictionary will be appended to the `workingList` which is returned once iteration is complete preLength = len(data) - log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO") + log.out(f"corpus.onlyStations: Removing items without `3ALPHA` from {preLength} documents", "INFO") workingList = [] for item in data: if '3ALPHA' in item: workingList.append(item) postLength = len(workingList) - log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO") + log.out(f"corpus.onlyStations: Removed {preLength - postLength} documents", "INFO") log.out(f"Yes, I am aware there are not {postLength} stations but the data includes NI, TfL, some bus, tram and closed stations too","OTHR") return workingList \ No newline at end of file diff --git a/src/mongo.py b/src/mongo.py index 4e4be45..563dd34 100644 --- a/src/mongo.py +++ b/src/mongo.py @@ -11,7 +11,7 @@ db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER', "owl")) db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS', "twittwoo")) db_name = os.getenv('OWL_DB_NAME', "owlboard") -log.out(f"mongo.py: Connecting to database at {db_host}:{db_port}", "INFOm") +log.out(f"mongo.py: Connecting to database at {db_host}:{db_port}", "INFO") client = MongoClient(f"mongodb://{db_user}:{db_pass}@{db_host}:{db_port}") db = client[db_name] @@ -30,34 +30,46 @@ def getLength(collection): col = db[collection] return col.count_documents({}) +def createSingleIndex(collection, field): + col = db[collection] + col.create_index(field) + log.out(f'mongo.createSingleIndex: Created index of {field} in {collection}', "INFO") + return + + def putBulkCorpus(data): - startCount = getLength("corpus") - col = db["corpus"] + collection = "corpus" + startCount = getLength(collection) + col = db[collection] if startCount > 0: log.out(f'mongo.putBulkCorpus: Dropping {startCount} CORPUS documents', "INFO") col.drop() col.insert_many(data) - endCount = getLength("corpus") + endCount = getLength(collection) log.out(f'mongo.putBulkCorpus: {endCount} documents inserted', "INFO") log.out(f'mongo.putBulkCorpus: {endCount - startCount} new documents', "INFO") - #log.out('mongo.putBulkCorpus: Building collection indexes',"INFO") - # ADD INDEXES HERE + log.out('mongo.putBulkCorpus: Building collection indexes',"INFO") + createSingleIndex(collection, "NLC") + createSingleIndex(collection, "3ALPHA") log.out('mongo.putBulkCorpus: Updating meta time',"INFO") - metaUpdateTime("corpus") + metaUpdateTime(collection) return def putBulkStations(data): - startCount = getLength("stations") - col = db["stations"] + collection = "stations" + startCount = getLength(collection) + col = db[collection] if startCount > 0: log.out(f'mongo.putBulkStations: Dropping {startCount} station documents', "INFO") col.drop() col.insert_many(data) - endCount = getLength("stations") + endCount = getLength(collection) log.out(f'mongo.putBulkStations: {endCount} documents inserted', "INFO") log.out(f'mongo.putBulkStations: {endCount - startCount} new documents', "INFO") - #log.out('mongo.putBulkStations: Building collection indexes',"INFO") - # ADD INDEXES HERE + log.out('mongo.putBulkStations: Building collection indexes',"INFO") + createSingleIndex(collection, "3ALPHA") + createSingleIndex(collection, "STANOX") + createSingleIndex(collection, "TIPLOC") log.out('mongo.putBulkStations: Updating meta time',"INFO") - metaUpdateTime("stations") + metaUpdateTime(collection) return \ No newline at end of file