Fix parsing of schedule card text
This commit is contained in:
parent
49d47b479b
commit
11ec1329a5
5
pyvenv.cfg
Normal file
5
pyvenv.cfg
Normal file
@ -0,0 +1,5 @@
|
||||
home = /usr/bin
|
||||
include-system-site-packages = false
|
||||
version = 3.12.7
|
||||
executable = /usr/bin/python3.12
|
||||
command = /usr/bin/python -m venv /home/fred.boniface/git/owlboard/dgp2
|
16
src/main.py
16
src/main.py
@ -1,6 +1,8 @@
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import datetime
|
||||
import json
|
||||
import parse_pdf
|
||||
|
||||
# List all PDF files in the given directory
|
||||
@ -12,7 +14,7 @@ def list_pdf_files(directory):
|
||||
def get_schedule_card_data(filepath):
|
||||
filename = os.path.basename(filepath)
|
||||
date_str = filename[:4]
|
||||
date_object = datetime.now()
|
||||
date_object = datetime.datetime.now()
|
||||
if len(date_str) < 4:
|
||||
raise ValueError("Filename must have at least four characters")
|
||||
return None
|
||||
@ -24,8 +26,8 @@ def get_schedule_card_data(filepath):
|
||||
month = int(month_str)
|
||||
day = int(day_str)
|
||||
|
||||
year = datetime.now().year
|
||||
date_object = datetime(year=year, month=month, day=day)
|
||||
year = datetime.datetime.now().year
|
||||
date_object = datetime.datetime(year=year, month=month, day=day)
|
||||
except ValueError as e:
|
||||
print(f"Error parsing date: {e}")
|
||||
return None
|
||||
@ -66,6 +68,14 @@ def main():
|
||||
else:
|
||||
print(f"Found {len(pdf_files)} PDF files")
|
||||
|
||||
schedule_cards = []
|
||||
for pdf_file in pdf_files:
|
||||
schedule_cards.append(get_schedule_card_data(pdf_file))
|
||||
|
||||
# print(schedule_cards)
|
||||
out = open("output.txt", "w")
|
||||
out.write(json.dumps(schedule_cards, indent=4, default=str))
|
||||
out.close()
|
||||
|
||||
## For each file in list, run get_schedule_card_data(filepath)
|
||||
## if returned value is not None, append to a list. Once
|
||||
|
@ -32,7 +32,7 @@ def parse_pdf_file(filename):
|
||||
parsed_schedule_cards = []
|
||||
|
||||
for schedule_card in schedule_cards:
|
||||
train_sections = re.split(f"(?={train_start_pattern})", schedule_cards[0])
|
||||
train_sections = re.split(f"(?={train_start_pattern})", schedule_card)
|
||||
train_sections = [section.strip() for section in train_sections if section.strip()]
|
||||
|
||||
parsed_schedule_cards.append(train_sections)
|
||||
|
Loading…
Reference in New Issue
Block a user