Now updates corpus and stations
This commit is contained in:
parent
72445785a5
commit
b9bb1b1afa
@ -40,12 +40,12 @@ def removeEmpty(data):
|
|||||||
def onlyStations(data):
|
def onlyStations(data):
|
||||||
# DATA: List of Dictionaries
|
# DATA: List of Dictionaries
|
||||||
# Iterates through the list and checks each item (which is a dictionary) for the STANOX key
|
# Iterates through the list and checks each item (which is a dictionary) for the 3ALPHA key
|
||||||
# If the STANOX exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
|
# If the 3ALPHA exists the dictionary will be appended to the `workingList` which is returned once iteration is complete
|
||||||
preLength = len(data)
|
preLength = len(data)
|
||||||
log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
|
log.out(f"corpus.onlyStations: Removing items without STANOX from {preLength} datapoints", "INFO")
|
||||||
workingList = []
|
workingList = []
|
||||||
for item in data:
|
for item in data:
|
||||||
if 'STANOX' in item:
|
if '3ALPHA' in item:
|
||||||
workingList.append(item)
|
workingList.append(item)
|
||||||
postLength = len(workingList)
|
postLength = len(workingList)
|
||||||
log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
|
log.out(f"corpus.onlyStations: Removed {preLength - postLength} datapoints", "INFO")
|
||||||
|
37
src/main.py
37
src/main.py
@ -14,19 +14,40 @@
|
|||||||
# program. If not, see
|
# program. If not, see
|
||||||
# https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE
|
# https://git.fjla.uk/OwlBoard/db-manager/src/branch/main/LICENSE
|
||||||
|
|
||||||
print("main.py: Initialising db-manager")
|
version = "0.1.0"
|
||||||
|
print(f"main.py: Initialising db-manager v{version}")
|
||||||
|
|
||||||
#Third Party Imports
|
#Third Party Imports
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
|
||||||
#Local Imports
|
#Local Imports
|
||||||
import corpus
|
import corpus, mongo
|
||||||
#import mongo
|
|
||||||
import logger as log
|
import logger as log
|
||||||
|
|
||||||
#This is only a test run:
|
log.out("main.py: db-manager Initialised", "INFO")
|
||||||
rawCorpus = corpus.fetch()
|
|
||||||
cleanCorpus = corpus.removeEmpty(rawCorpus)
|
#Check & Update corpus/stations:
|
||||||
stationsOnly = corpus.onlyStations(cleanCorpus)
|
# If older than 12 days then update
|
||||||
print("DONE")
|
# Refresh threshold in seconds: 12 days (12 * 24 * 60 * 60 == 1036800).
MAX_AGE_SECONDS = 12 * 24 * 60 * 60

# --- CORPUS ---------------------------------------------------------------
# Age is "now" minus the timestamp returned by mongo.metaCheckTime.
corpusAge = int(time.time()) - mongo.metaCheckTime("corpus")
log.out(f'main.py: Corpus is {corpusAge}s old', "INFO")
corpusIsStale = corpusAge > MAX_AGE_SECONDS

if not corpusIsStale:
    log.out('main.py: Not updating CORPUS data', "INFO")
else:
    log.out('main.py: Updating CORPUS data', "INFO")
    # Fetch, strip empty values, then bulk-load into the database.
    rawCorpus = corpus.fetch()
    corpusData = corpus.removeEmpty(rawCorpus)
    mongo.putBulkCorpus(corpusData)

# --- Stations -------------------------------------------------------------
stationsAge = int(time.time()) - mongo.metaCheckTime("stations")
log.out(f'main.py: Stations is {stationsAge}s old', "INFO")

# Stations are derived from the CORPUS data fetched above, so the decision is
# deliberately keyed on the CORPUS age rather than on stationsAge: if
# stationsAge were used, a stations refresh could be attempted when
# corpusData was never created. Switch to stationsAge when/if the stations
# source stops being CORPUS.
if not corpusIsStale:
    log.out('main.py: Not updating stations data', "INFO")
else:
    log.out('main.py: Updating stations data', "INFO")
    stationData = corpus.onlyStations(corpusData)
    mongo.putBulkStations(stationData)

# END
log.out(f"main.py: db-manager v{version} Complete", "INFO")
|
59
src/mongo.py
59
src/mongo.py
@ -1,9 +1,62 @@
|
|||||||
import os
|
import os
|
||||||
from pymongo import MongoClient
|
from pymongo import MongoClient
|
||||||
|
import time
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import logger as log
|
import logger as log
|
||||||
|
|
||||||
log.out("mongo.py: Fetching configuration", "INFO")
|
log.out("mongo.py: Fetching configuration", "INFO")
|
||||||
db_url = os.getenv('OWL_DB_HOST') + ":" + os.getenv('OWL_DB_PORT')
|
db_host = os.getenv('OWL_DB_HOST', 'localhost')
|
||||||
db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER'))
|
db_port = os.getenv('OWL_DB_PORT', 27017)
|
||||||
db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS'))
|
# Credentials come from the environment, with fallbacks, and are
# percent-encoded with quote_plus before being placed in the connection URI.
db_user = urllib.parse.quote_plus(os.getenv('OWL_DB_USER', "owl"))
db_pass = urllib.parse.quote_plus(os.getenv('OWL_DB_PASS', "twittwoo"))
db_name = os.getenv('OWL_DB_NAME', "owlboard")

# Build the URI separately so the client construction reads clearly.
db_uri = f"mongodb://{db_user}:{db_pass}@{db_host}:{db_port}"
client = MongoClient(db_uri)

# Module-wide database handle used by the helper functions in this file.
db = client[db_name]
|
||||||
|
|
||||||
|
def metaCheckTime(target):
    """Return the stored "updated" timestamp for `target`.

    Looks up the document in the "meta" collection whose "target" field
    equals `target` and returns its "updated" value.

    If no such document exists, or it has no "updated" field (for example
    on a freshly created database), 0 is returned. Callers that compute
    `now - metaCheckTime(target)` then see the data as stale rather than
    crashing on a missing record.
    """
    col = db["meta"]
    res = col.find_one({"target": target})
    # find_one yields None when nothing matches; subscripting it would raise.
    if res is None or "updated" not in res:
        log.out(f'mongo.metaCheckTime: {target} has no recorded update time', "INFO")
        return 0
    log.out(f'mongo.metaCheckTime: {target} last updated at {res["updated"]}', "INFO")
    return res["updated"]
|
||||||
|
|
||||||
|
def metaUpdateTime(target):
    """Record the current time as the "updated" value for `target`.

    Writes to the "meta" collection, matching on the "target" field. The
    write is an upsert, so the document is created if it does not exist.
    Returns None.
    """
    log.out(f'mongo.metaUpdateTime: Updating updated time for {target}', "INFO")
    now = int(time.time())
    db["meta"].update_one(
        {"target": target},
        {"$set": {"updated": now}},
        upsert=True,
    )
|
||||||
|
|
||||||
|
def getLength(collection):
    """Return the number of documents in the collection named `collection`."""
    # An empty filter matches every document.
    return db[collection].count_documents({})
|
||||||
|
|
||||||
|
def putBulkCorpus(data):
    """Replace the contents of the "corpus" collection with `data`.

    Any existing documents are dropped, `data` is bulk-inserted, the
    resulting counts are logged, and the "corpus" meta timestamp is updated.

    If `data` is empty, nothing is dropped or inserted and the meta
    timestamp is left untouched. Without this guard the existing collection
    would be dropped before the insert fails on the empty list.

    Returns None.
    """
    if not data:
        log.out('mongo.putBulkCorpus: No data supplied, existing CORPUS documents left unchanged', "INFO")
        return
    startCount = getLength("corpus")
    col = db["corpus"]
    if startCount > 0:
        log.out(f'mongo.putBulkCorpus: Dropping {startCount} CORPUS documents', "INFO")
        col.drop()
    col.insert_many(data)
    endCount = getLength("corpus")
    log.out(f'mongo.putBulkCorpus: {endCount} documents inserted', "INFO")
    log.out(f'mongo.putBulkCorpus: {endCount - startCount} new documents', "INFO")
    # TODO: build collection indexes here.
    log.out('mongo.putBulkCorpus: Updating meta time',"INFO")
    metaUpdateTime("corpus")
    return
|
||||||
|
|
||||||
|
def putBulkStations(data):
    """Replace the contents of the "stations" collection with `data`.

    Any existing documents are dropped, `data` is bulk-inserted, the
    resulting counts are logged, and the "stations" meta timestamp is
    updated.

    If `data` is empty, nothing is dropped or inserted and the meta
    timestamp is left untouched. Without this guard the existing collection
    would be dropped before the insert fails on the empty list.

    Returns None.
    """
    if not data:
        log.out('mongo.putBulkStations: No data supplied, existing station documents left unchanged', "INFO")
        return
    startCount = getLength("stations")
    col = db["stations"]
    if startCount > 0:
        log.out(f'mongo.putBulkStations: Dropping {startCount} station documents', "INFO")
        col.drop()
    col.insert_many(data)
    endCount = getLength("stations")
    log.out(f'mongo.putBulkStations: {endCount} documents inserted', "INFO")
    log.out(f'mongo.putBulkStations: {endCount - startCount} new documents', "INFO")
    # TODO: build collection indexes here.
    log.out('mongo.putBulkStations: Updating meta time',"INFO")
    metaUpdateTime("stations")
    return
|
Reference in New Issue
Block a user