From 11ec1329a5bf6066d0b0d7decc7d744676822359 Mon Sep 17 00:00:00 2001 From: Fred Boniface Date: Sat, 26 Oct 2024 11:37:07 +0100 Subject: [PATCH] Fix parsing of schedule card text --- pyvenv.cfg | 5 +++++ src/main.py | 16 +++++++++++++--- src/parse_pdf.py | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 pyvenv.cfg diff --git a/pyvenv.cfg b/pyvenv.cfg new file mode 100644 index 0000000..3a27ad8 --- /dev/null +++ b/pyvenv.cfg @@ -0,0 +1,5 @@ +home = /usr/bin +include-system-site-packages = false +version = 3.12.7 +executable = /usr/bin/python3.12 +command = /usr/bin/python -m venv /home/fred.boniface/git/owlboard/dgp2 diff --git a/src/main.py b/src/main.py index e1a7332..e0587c9 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,8 @@ import os import sys import glob +import datetime +import json import parse_pdf # List all PDF files in the given directory @@ -12,7 +14,7 @@ def list_pdf_files(directory): def get_schedule_card_data(filepath): filename = os.path.basename(filepath) date_str = filename[:4] - date_object = datetime.now() + date_object = datetime.datetime.now() if len(date_str) < 4: raise ValueError("Filename must have at least four characters") return None @@ -24,8 +26,8 @@ def get_schedule_card_data(filepath): month = int(month_str) day = int(day_str) - year = datetime.now().year - date_object = datetime(year=year, month=month, day=day) + year = datetime.datetime.now().year + date_object = datetime.datetime(year=year, month=month, day=day) except ValueError as e: print(f"Error parsing date: {e}") return None @@ -66,6 +68,14 @@ def main(): else: print(f"Found {len(pdf_files)} PDF files") + schedule_cards = [] + for pdf_file in pdf_files: + schedule_cards.append(get_schedule_card_data(pdf_file)) + + # print(schedule_cards) + out = open("output.txt", "w") + out.write(json.dumps(schedule_cards, indent=4, default=str)) + out.close() ## For each file in list, run get_schedule_card_data(filepath) ## if returned value is not None, append to a list. Once diff --git a/src/parse_pdf.py b/src/parse_pdf.py index 006ebb8..b52d335 100644 --- a/src/parse_pdf.py +++ b/src/parse_pdf.py @@ -32,7 +32,7 @@ def parse_pdf_file(filename): parsed_schedule_cards = [] for schedule_card in schedule_cards: - train_sections = re.split(f"(?={train_start_pattern})", schedule_cards[0]) + train_sections = re.split(f"(?={train_start_pattern})", schedule_card) train_sections = [section.strip() for section in train_sections if section.strip()] parsed_schedule_cards.append(train_sections)