Add validation steps

2024-10-31 23:25:41 +00:00 · 2024-10-31 23:25:41 +00:00 · be31e6cfe1
commit be31e6cfe1
parent 7d56f09c87
4 changed files with 3139 additions and 4 deletions
--- a/output.txt
+++ b/output.txt
--- a/src/main.py
+++ b/src/main.py
@ -3,7 +3,10 @@ import sys
 import glob
 import datetime
 import json
 import parse_pdf
 import train_detail
 import validate
 # List all PDF files in the given directory
 def list_pdf_files(directory):
@ -72,11 +75,52 @@ def main():
    for pdf_file in pdf_files:
        schedule_cards.append(get_schedule_card_data(pdf_file))
-   # print(schedule_cards)
+    # Iterate over schedule cards and fetch train data
-    out = open("output.txt", "w")
+    trains = []
-    out.write(json.dumps(schedule_cards, indent=4, default=str))
+    existing_codes = []
-    out.close()
+    for schedule_card in schedule_cards:
        date = schedule_card['schedule_date']
        for schedule in schedule_card['schedule_data']:
            ## Skip over schedules which have no PIS Code
            if schedule['pis_code'] is None:
                continue
            ## Check if PIS code already processed, if so skip it.  Else add it to 'existing_codes'
            if schedule['pis_code'] in existing_codes:
                continue
            existing_codes.append(schedule['pis_code'])
            full_schedule = train_detail.find_gw_trains_by_headcode(schedule['headcode'], date)
            train = {
                'schedule_date': date,
                'schedule_headcode': schedule['headcode'],
                'schedule_location': schedule['location'],
                'schedule_time1': schedule['time0'],
                'schedule_time2': schedule['time1'],
                'schedule_pis_code': schedule['pis_code'],
                'timetable_entry': full_schedule,
            }
            ## Filter out values missing schedule details or with matching PIS Codes
            if not full_schedule:
                continue # Skip if timetable entry was not found
            if len(train['timetable_entry']) == 1 and train['schedule_pis_code'] == train['timetable_entry'][0].get('pis', {}).get('code'):
                continue # Skip if only one possible timetable entry exists, and PIS code matches OwlBoards suggestion
            trains.append(train)
    organised_for_processing = train_detail.organise_trains(trains)
    auto_matched = validate.filter_timetable_entries(organised_for_processing)
    # print(trains)
    out = open("output.txt", "w")
    out.write(json.dumps(auto_matched, indent=4, default=str))
    out.close()
 if __name__ == "__main__":
    main()
--- a/src/train_detail.py
+++ b/src/train_detail.py
@ -0,0 +1,65 @@
 from pyOwlBoard import client
 ob_client = client.OwlBoardClient('https://owlboard.info', 'x')
 def find_gw_trains_by_headcode(headcode, date):
    """Fetch detailed records for the GW/HX services running under a headcode.

    Searches OwlBoard for every service matching *headcode* on *date*, keeps
    only those whose 'operator' is 'GW' or 'HX', then looks each remaining
    service up by its 'trainUid'.

    Args:
        headcode: Headcode passed to the OwlBoard headcode search.
        date: Date passed through to both OwlBoard lookups.

    Returns:
        A list with one ``get_trains_by_trainUid`` result per GW/HX service,
        in the order the search returned them (empty when nothing matches).
    """
    candidates = ob_client.get_trains_by_headcode(headcode, date)
    # Build a filtered list rather than removing items from a copy-iterated
    # list: one pass instead of a scan per removal, and the list handed back
    # by the client is left untouched.
    wanted = [svc for svc in candidates if svc['operator'] in ('GW', 'HX')]
    # Filtering is completed before any detail request is issued.
    return [ob_client.get_trains_by_trainUid(svc['trainUid'], date) for svc in wanted]
 # Reshape the list of trains, keeping only the fields needed for validation
 def organise_trains(trains):
    """Return a trimmed copy of each train record, ready for processing.

    Args:
        trains: Iterable of dicts providing 'schedule_date',
            'schedule_headcode', 'schedule_time1', 'schedule_time2',
            'schedule_pis_code' and 'timetable_entry'.

    Returns:
        A new list with one dict per input record. The schedule values are
        carried over under 'diagram_*' / 'train_headcode' keys, and
        'timetable_entries' holds the result of passing the record's
        'timetable_entry' through rationalise_timetable_entry(). Any other
        keys on the input record are not copied.
    """
    return [
        {
            'diagram_date': record['schedule_date'],
            'train_headcode': record['schedule_headcode'],
            'diagram_time1': record['schedule_time1'],
            'diagram_time2': record['schedule_time2'],
            'diagram_pis_code': record['schedule_pis_code'],
            'timetable_entries': rationalise_timetable_entry(record['timetable_entry']),
        }
        for record in trains
    ]
 # Reduce each timetable entry to the fields used for PIS identification
 def rationalise_timetable_entry(timetable_entries):
    """Summarise timetable entries, replacing their stops with CRS codes.

    Args:
        timetable_entries: Iterable of timetable entry dicts. Each must
            provide the keys copied below plus a non-empty 'stops' list.

    Returns:
        A list with one summary dict per entry containing the copied
        identification fields, 'trainStartTime' (the first stop's
        'wttDeparture', falling back to its 'publicDeparture' when that is
        missing or falsy) and 'stops' as produced by
        create_crs_list_from_stops().
    """
    copied_keys = (
        'stpIndicator',
        'operator',
        'trainUid',
        'scheduleStart',
        'scheduleEnd',
        'daysRun',
    )
    summaries = []
    for source in timetable_entries:
        summary = {key: source[key] for key in copied_keys}
        first_stop = source['stops'][0]
        summary['trainStartTime'] = first_stop.get('wttDeparture') or first_stop.get('publicDeparture')
        summary['stops'] = create_crs_list_from_stops(source['stops'])
        summaries.append(summary)
    return summaries
 # Build the list of CRS codes for the public stops in a list of stops
 def create_crs_list_from_stops(stops):
    """Return the CRS code of every public stop, in calling order.

    Stops whose 'isPublic' value is falsy are skipped. For each remaining
    stop the location references for its 'tiploc' are requested from
    OwlBoard and the '3ALPHA' value of the first reference is used.

    Args:
        stops: Iterable of stop dicts providing 'isPublic' and 'tiploc'.

    Returns:
        A list of '3ALPHA' values, one per public stop.
    """
    # One lookup per public stop, issued in stop order as each stop is visited.
    return [
        ob_client.get_loc_ref_codes_by_tiploc(call['tiploc'])[0]['3ALPHA']
        for call in stops
        if call['isPublic']
    ]
--- a/src/validate.py
+++ b/src/validate.py
@ -0,0 +1,36 @@
 ## Validates and filters input based on whether diagram time matches schedule start time
 def filter_timetable_entries(diagram_entries):
    """Auto-match diagram entries to a timetable entry by start time.

    For each diagram entry, 'diagram_time1' and 'diagram_time2' have any '.'
    characters removed and are compared with the 'trainStartTime' of every
    item in 'timetable_entries'. When exactly one timetable entry matches,
    'timetable_entries' is narrowed to that entry and 'verification' is set
    to mark the automatic match. Entries with no match, or more than one,
    are left unchanged.

    Args:
        diagram_entries: List of diagram entry dicts; modified in place.

    Returns:
        The same ``diagram_entries`` list that was passed in.
    """
    for entry in diagram_entries:
        # A missing diagram time normalises to '' and is dropped here, so it
        # can never "match" a timetable entry that has a blank start time.
        diagram_times = [
            time
            for time in (
                (entry.get('diagram_time1') or '').replace('.', ''),
                (entry.get('diagram_time2') or '').replace('.', ''),
            )
            if time
        ]
        # Treat an explicit None the same as an absent key.
        timetable_entries = entry.get('timetable_entries') or []
        matches = [
            candidate for candidate in timetable_entries
            if candidate['trainStartTime'] in diagram_times
        ]
        # Only a unique match is trusted; ambiguous results need manual review.
        if len(matches) == 1:
            entry['timetable_entries'] = matches
            entry['verification'] = "AUTO: Diagram/Schedule Time Match"
    return diagram_entries
 ## Checks OwlBoard API for existing PIS codes and whether they match
 def check_and_validate_against_owlboard(train_entries):
    """Check entries against the PIS codes OwlBoard already holds (planned).

    The notes below record the intended behaviour as written by the author.
    NOTE(review): no executable statements are visible for this function in
    this view -- confirm whether an implementation follows or is still to be
    written; as shown, calling it would do nothing and return None.
    """
    ### Loop through input list (which is output of above function, currently in output.txt)
    ### check whether code exists in OwlBoard API, if so - does it match.
    ### If exists but no match, open an issue.
    ### If does not exist, do nothing.
    ### If exists and does match, remove from input list.
    ### Return output