Fix parsing of schedule card text

This commit is contained in:
Fred Boniface 2024-10-26 11:37:07 +01:00
parent 49d47b479b
commit 11ec1329a5
3 changed files with 19 additions and 4 deletions

5
pyvenv.cfg Normal file
View File

@ -0,0 +1,5 @@
home = /usr/bin
include-system-site-packages = false
version = 3.12.7
executable = /usr/bin/python3.12
command = /usr/bin/python -m venv /home/fred.boniface/git/owlboard/dgp2

View File

@ -1,6 +1,8 @@
import os import os
import sys import sys
import glob import glob
import datetime
import json
import parse_pdf import parse_pdf
# List all PDF files in the given directory # List all PDF files in the given directory
@ -12,7 +14,7 @@ def list_pdf_files(directory):
def get_schedule_card_data(filepath): def get_schedule_card_data(filepath):
filename = os.path.basename(filepath) filename = os.path.basename(filepath)
date_str = filename[:4] date_str = filename[:4]
date_object = datetime.now() date_object = datetime.datetime.now()
if len(date_str) < 4: if len(date_str) < 4:
raise ValueError("Filename must have at least four characters") raise ValueError("Filename must have at least four characters")
return None return None
@ -24,8 +26,8 @@ def get_schedule_card_data(filepath):
month = int(month_str) month = int(month_str)
day = int(day_str) day = int(day_str)
year = datetime.now().year year = datetime.datetime.now().year
date_object = datetime(year=year, month=month, day=day) date_object = datetime.datetime(year=year, month=month, day=day)
except ValueError as e: except ValueError as e:
print(f"Error parsing date: {e}") print(f"Error parsing date: {e}")
return None return None
@ -66,6 +68,14 @@ def main():
else: else:
print(f"Found {len(pdf_files)} PDF files") print(f"Found {len(pdf_files)} PDF files")
schedule_cards = []
for pdf_file in pdf_files:
schedule_cards.append(get_schedule_card_data(pdf_file))
# print(schedule_cards)
out = open("output.txt", "w")
out.write(json.dumps(schedule_cards, indent=4, default=str))
out.close()
## For each file in list, run get_schedule_card_data(filepath) ## For each file in list, run get_schedule_card_data(filepath)
## if returned value is not None, append to a list. Once ## if returned value is not None, append to a list. Once

View File

@ -32,7 +32,7 @@ def parse_pdf_file(filename):
parsed_schedule_cards = [] parsed_schedule_cards = []
for schedule_card in schedule_cards: for schedule_card in schedule_cards:
train_sections = re.split(f"(?={train_start_pattern})", schedule_cards[0]) train_sections = re.split(f"(?={train_start_pattern})", schedule_card)
train_sections = [section.strip() for section in train_sections if section.strip()] train_sections = [section.strip() for section in train_sections if section.strip()]
parsed_schedule_cards.append(train_sections) parsed_schedule_cards.append(train_sections)