Now updates corpus and stations

This commit is contained in:
Fred Boniface 2023-02-11 22:05:30 +00:00
parent 72445785a5
commit b9bb1b1afa
3 changed files with 87 additions and 13 deletions

View File

@ -40,12 +40,12 @@ def removeEmpty(data):
def onlyStations(data):
# DATA: List of Dictionaries
# Iterates through the list and checks each item (which is a dictionary) for the STANOX key
# If the STANOX exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
# If the 3ALPHA exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
preLength = len(data)
log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
workingList = []
for item in data:
if 'STANOX' in item:
if '3ALPHA' in item:
workingList.append(item)
postLength = len(workingList)
log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")

View File

@ -14,19 +14,40 @@
# program. If not, see
# https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE
print("main.py: Initialising db-manager")
version = "0.1.0"
print(f"main.py: Initialising db-manager v{version}")
#Third Party Imports
import os
import time
#Local Imports
import corpus
#import mongo
import corpus, mongo
import logger as log
#This is only a test run:
rawCorpus = corpus.fetch()
cleanCorpus = corpus.removeEmpty(rawCorpus)
stationsOnly = corpus.onlyStations(cleanCorpus)
print("DONE")
log.out("main.py: db-manager Initialised", "INFO")

# Refresh threshold shared by CORPUS and stations: 12 days, in seconds.
maxAge = 12 * 24 * 60 * 60

# --- CORPUS ---------------------------------------------------------------
# Age is "now" minus the timestamp recorded in the meta collection.
corpusAge = int(time.time()) - mongo.metaCheckTime("corpus")
log.out(f'main.py: Corpus is {corpusAge}s old', "INFO")
corpusStale = corpusAge > maxAge
if not corpusStale:
    log.out('main.py: Not updating CORPUS data', "INFO")
else:
    log.out('main.py: Updating CORPUS data', "INFO")
    rawCorpus = corpus.fetch()
    corpusData = corpus.removeEmpty(rawCorpus)
    mongo.putBulkCorpus(corpusData)

# --- Stations -------------------------------------------------------------
stationsAge = int(time.time()) - mongo.metaCheckTime("stations")
log.out(f'main.py: Stations is {stationsAge}s old', "INFO")
# Stations are derived from the CORPUS download, so the decision deliberately
# follows the CORPUS staleness flag rather than stationsAge: corpusData only
# exists when the CORPUS branch above actually ran. If stations ever get their
# own data source, switch this condition over to stationsAge.
if corpusStale:
    log.out('main.py: Updating stations data', "INFO")
    stationData = corpus.onlyStations(corpusData)
    mongo.putBulkStations(stationData)
else:
    log.out('main.py: Not updating stations data', "INFO")

# END
log.out(f"main.py: db-manager v{version} Complete", "INFO")

View File

@ -1,9 +1,62 @@
import os
from pymongo import MongoClient
import time
import urllib.parse
import logger as log
log.out("mongo.py: Fetching configuration", "INFO")
db_url = os.getenv('OWL_DB_HOST') + ":" + os.getenv('OWL_DB_PORT')
db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER'))
db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS'))
# Connection settings are read from the environment, falling back to the
# defaults given here when a variable is unset.
db_host = os.getenv('OWL_DB_HOST', 'localhost')
db_port = os.getenv('OWL_DB_PORT', 27017)
db_name = os.getenv('OWL_DB_NAME', "owlboard")

# Credentials are passed through quote_plus so that reserved characters
# cannot break the connection URI.
# NOTE(review): the fallback username/password are hard-coded in source;
# confirm they are only ever valid for local development.
db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER', "owl"))
db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS', "twittwoo"))

mongoUri = f"mongodb://{db_user}:{db_pass}@{db_host}:{db_port}"
client = MongoClient(mongoUri)
db = client[db_name]
def metaCheckTime(target):
    """Return the recorded last-update time for `target` from the `meta` collection.

    target: value of the "target" field identifying the meta document
            (callers in this project pass e.g. "corpus" or "stations").

    Returns the document's "updated" value, or 0 when no such document (or no
    "updated" field) exists yet. Returning 0 makes a never-updated target look
    maximally old, so an age check of the form `now - metaCheckTime(...)`
    triggers a refresh on a fresh database instead of crashing.
    """
    col = db["meta"]
    res = col.find_one({"target": target})
    # find_one yields None when nothing matches, which is the normal state
    # before the first successful update has written a meta record.
    if res is None or "updated" not in res:
        log.out(f'mongo.metaCheckTime: No update time recorded for {target}', "INFO")
        return 0
    log.out(f'mongo.metaCheckTime: {target} last updated at {res["updated"]}', "INFO")
    return res["updated"]
def metaUpdateTime(target):
    """Record the current time as the last-update time for `target`.

    Writes int(time.time()) into the "updated" field of the `meta` document
    whose "target" equals `target`; upsert=True creates that document when it
    does not exist yet. Returns None.
    """
    log.out(f'mongo.metaUpdateTime: Updating updated time for {target}', "INFO")
    selector = {"target": target}
    change = {"$set": {"updated": int(time.time())}}
    db["meta"].update_one(selector, change, upsert=True)
def getLength(collection):
    """Return how many documents the collection named `collection` holds."""
    # An empty filter matches every document in the collection.
    matchAll = {}
    return db[collection].count_documents(matchAll)
def putBulkCorpus(data):
    """Replace the contents of the `corpus` collection with `data`.

    data: list of documents (dictionaries) to store.

    Any existing documents are dropped, `data` is bulk-inserted, the resulting
    counts are logged and the "corpus" meta time is refreshed. Returns None.

    When `data` is empty nothing is touched: insert_many() refuses an empty
    list, and by that point the old documents would already have been dropped,
    leaving the collection empty. The meta time is deliberately not updated in
    that case so the next run tries again.
    """
    if not data:
        log.out('mongo.putBulkCorpus: No data supplied, keeping existing CORPUS documents', "INFO")
        return
    startCount = getLength("corpus")
    col = db["corpus"]
    if startCount > 0:
        log.out(f'mongo.putBulkCorpus: Dropping {startCount} CORPUS documents', "INFO")
        col.drop()
    col.insert_many(data)
    endCount = getLength("corpus")
    log.out(f'mongo.putBulkCorpus: {endCount} documents inserted', "INFO")
    # Net change relative to what was stored before the drop.
    log.out(f'mongo.putBulkCorpus: {endCount - startCount} new documents', "INFO")
    # TODO: build collection indexes here once they are defined.
    log.out('mongo.putBulkCorpus: Updating meta time',"INFO")
    metaUpdateTime("corpus")
def putBulkStations(data):
    """Replace the contents of the `stations` collection with `data`.

    data: list of documents (dictionaries) to store.

    Any existing documents are dropped, `data` is bulk-inserted, the resulting
    counts are logged and the "stations" meta time is refreshed. Returns None.

    When `data` is empty nothing is touched: insert_many() refuses an empty
    list, and by that point the old documents would already have been dropped,
    leaving the collection empty. The meta time is deliberately not updated in
    that case so the next run tries again.
    """
    if not data:
        log.out('mongo.putBulkStations: No data supplied, keeping existing station documents', "INFO")
        return
    startCount = getLength("stations")
    col = db["stations"]
    if startCount > 0:
        log.out(f'mongo.putBulkStations: Dropping {startCount} station documents', "INFO")
        col.drop()
    col.insert_many(data)
    endCount = getLength("stations")
    log.out(f'mongo.putBulkStations: {endCount} documents inserted', "INFO")
    # Net change relative to what was stored before the drop.
    log.out(f'mongo.putBulkStations: {endCount - startCount} new documents', "INFO")
    # TODO: build collection indexes here once they are defined.
    log.out('mongo.putBulkStations: Updating meta time',"INFO")
    metaUpdateTime("stations")