Add validation steps

This commit is contained in:
Fred Boniface 2024-10-31 23:25:41 +00:00
parent 7d56f09c87
commit be31e6cfe1
4 changed files with 3139 additions and 4 deletions

2990
output.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,10 @@ import sys
import glob import glob
import datetime import datetime
import json import json
import parse_pdf import parse_pdf
import train_detail
import validate
# List all PDF files in the given directory # List all PDF files in the given directory
def list_pdf_files(directory): def list_pdf_files(directory):
@ -72,11 +75,52 @@ def main():
for pdf_file in pdf_files: for pdf_file in pdf_files:
schedule_cards.append(get_schedule_card_data(pdf_file)) schedule_cards.append(get_schedule_card_data(pdf_file))
# print(schedule_cards) # Iterate over schedule cards and fetch train data
out = open("output.txt", "w") trains = []
out.write(json.dumps(schedule_cards, indent=4, default=str)) existing_codes = []
out.close() for schedule_card in schedule_cards:
date = schedule_card['schedule_date']
for schedule in schedule_card['schedule_data']:
## Skip over schedules which have no PIS Code
if schedule['pis_code'] is None:
continue
## Check if PIS code already processed, if so skip it. Else add it to 'existing_codes'
if schedule['pis_code'] in existing_codes:
continue
existing_codes.append(schedule['pis_code'])
full_schedule = train_detail.find_gw_trains_by_headcode(schedule['headcode'], date)
train = {
'schedule_date': date,
'schedule_headcode': schedule['headcode'],
'schedule_location': schedule['location'],
'schedule_time1': schedule['time0'],
'schedule_time2': schedule['time1'],
'schedule_pis_code': schedule['pis_code'],
'timetable_entry': full_schedule,
}
## Filter out values missing schedule details or with matching PIS Codes
if not full_schedule:
continue # Skip if timetable entry was not found
if len(train['timetable_entry']) == 1 and train['schedule_pis_code'] == train['timetable_entry'][0].get('pis', {}).get('code'):
continue # Skip if only one possible timetable entry exists, and PIS code matches OwlBoards suggestion
trains.append(train)
organised_for_processing = train_detail.organise_trains(trains)
auto_matched = validate.filter_timetable_entries(organised_for_processing)
# print(trains)
out = open("output.txt", "w")
out.write(json.dumps(auto_matched, indent=4, default=str))
out.close()
if __name__ == "__main__": if __name__ == "__main__":
main() main()

65
src/train_detail.py Normal file
View File

@ -0,0 +1,65 @@
from pyOwlBoard import client
ob_client = client.OwlBoardClient('https://owlboard.info', 'x')
def find_gw_trains_by_headcode(headcode, date):
    """Return detail records for the GW/HX services found under a headcode.

    The OwlBoard client is asked for every service matching ``headcode`` on
    ``date``; services whose ``operator`` is neither ``'GW'`` nor ``'HX'``
    are discarded, and the remaining ones are looked up individually by
    their ``trainUid``.

    Returns:
        A list holding one client response per remaining service, in the
        order the headcode lookup returned them.
    """
    wanted_operators = ('GW', 'HX')
    services = ob_client.get_trains_by_headcode(headcode, date)

    # Filter first, fetch second: no detail request is made until every
    # service has been checked against the operator list.
    kept_services = [
        service for service in services
        if service['operator'] in wanted_operators
    ]

    return [
        ob_client.get_trains_by_trainUid(service['trainUid'], date)
        for service in kept_services
    ]
# Organize list of trains removing unimportant information
def organise_trains(trains):
    """Reshape each train record into the layout used for validation.

    Every input record contributes its schedule date, headcode, both
    schedule times and PIS code under ``diagram_*`` / ``train_headcode``
    keys, plus its timetable entries passed through
    ``rationalise_timetable_entry``.  Any other keys on the input record are
    not carried over.

    Returns:
        A new list of dicts, one per input record, in the same order.
    """
    return [
        {
            'diagram_date': record['schedule_date'],
            'train_headcode': record['schedule_headcode'],
            'diagram_time1': record['schedule_time1'],
            'diagram_time2': record['schedule_time2'],
            'diagram_pis_code': record['schedule_pis_code'],
            'timetable_entries': rationalise_timetable_entry(record['timetable_entry']),
        }
        for record in trains
    ]
# Convert timetable entry into a list of CRS Codes for PIS identification
def rationalise_timetable_entry(timetable_entries):
    """Reduce timetable entries to the fields needed for PIS identification.

    For each entry the schedule metadata is copied across, the departure
    time of the first stop is recorded as ``trainStartTime`` (``wttDeparture``
    if truthy, otherwise ``publicDeparture``), and the stop list is replaced
    by the output of ``create_crs_list_from_stops``.

    An entry whose ``stops`` list is empty no longer raises ``IndexError``:
    it is returned with ``trainStartTime`` set to ``None`` and an empty
    ``stops`` list.

    Args:
        timetable_entries: iterable of timetable entry dicts.

    Returns:
        A new list of reduced dicts, one per input entry, in the same order.

    Raises:
        KeyError: if an entry lacks one of the copied keys or ``stops``.
    """
    rationalised_entries = []
    for timetable_entry in timetable_entries:
        stops = timetable_entry['stops']
        # Guard against an entry with no stops at all; an empty dict makes
        # both departure lookups fall through to None.
        first_stop = stops[0] if stops else {}
        entry = {
            'stpIndicator': timetable_entry['stpIndicator'],
            'operator': timetable_entry['operator'],
            'trainUid': timetable_entry['trainUid'],
            'scheduleStart': timetable_entry['scheduleStart'],
            'scheduleEnd': timetable_entry['scheduleEnd'],
            'daysRun': timetable_entry['daysRun'],
            'trainStartTime': first_stop.get('wttDeparture') or first_stop.get('publicDeparture'),
            # Nothing to translate when there are no stops.
            'stops': create_crs_list_from_stops(stops) if stops else [],
        }
        rationalised_entries.append(entry)
    return rationalised_entries
# Work through list of stops, and create a list of CRS codes for public stops
def create_crs_list_from_stops(stops):
    """Return the ``'3ALPHA'`` code of every public stop, in calling order.

    Stops whose ``isPublic`` value is falsy are ignored.  For each remaining
    stop the OwlBoard client is queried with the stop's ``tiploc`` and the
    ``'3ALPHA'`` value of the first returned reference is taken.
    """
    return [
        ob_client.get_loc_ref_codes_by_tiploc(stop['tiploc'])[0]['3ALPHA']
        for stop in stops
        if stop['isPublic']
    ]

36
src/validate.py Normal file
View File

@ -0,0 +1,36 @@
## Validates and filters input based on whether diagram time matches schedule start time
def filter_timetable_entries(diagram_entries):
    """Narrow each diagram entry to the one timetable entry its time matches.

    For every entry, ``diagram_time1`` and ``diagram_time2`` have their dots
    removed and are compared with the ``trainStartTime`` of each item in
    ``timetable_entries``.  When exactly one timetable entry matches,
    ``timetable_entries`` is replaced by a list containing only that entry
    and ``verification`` is set to ``"AUTO: Diagram/Schedule Time Match"``.
    Entries with zero or several matches are left untouched.

    Missing or empty diagram times are ignored, so they can never be paired
    with a blank ``trainStartTime``.

    Args:
        diagram_entries: list of dicts; modified in place.

    Returns:
        The same ``diagram_entries`` list that was passed in.
    """
    for entry in diagram_entries:
        diagram_times = [
            (entry.get(key) or '').replace('.', '')
            for key in ('diagram_time1', 'diagram_time2')
        ]
        # Drop blanks: '' would otherwise equal an empty start time and
        # produce a bogus automatic verification.
        diagram_times = [time for time in diagram_times if time]

        timetable_entries = entry.get('timetable_entries', [])
        matching_entries = [
            candidate for candidate in timetable_entries
            if candidate['trainStartTime'] in diagram_times
        ]

        # Only a unique match is trusted; anything else needs manual review.
        if len(matching_entries) == 1:
            entry['timetable_entries'] = matching_entries
            entry['verification'] = "AUTO: Diagram/Schedule Time Match"
    return diagram_entries
## Checks OwlBoard API for existing PIS codes and whether they match
def check_and_validate_against_owlboard(train_entries):
    """Validate entries against PIS codes already held by OwlBoard.

    Not implemented yet.  Planned behaviour:

    * Loop through the input list (the output of
      ``filter_timetable_entries``, currently written to output.txt).
    * Check whether the code exists in the OwlBoard API and, if so, whether
      it matches.
    * If it exists but does not match, open an issue.
    * If it does not exist, do nothing.
    * If it exists and does match, remove it from the input list.
    * Return the output.

    Raises:
        NotImplementedError: always, until the steps above are written.
    """
    # A function body made only of comments is a syntax error and stops the
    # whole module from importing; fail loudly at call time instead.
    raise NotImplementedError(
        "check_and_validate_against_owlboard is not implemented yet"
    )