Fix parsing of schedule card text
This commit is contained in:
parent
49d47b479b
commit
11ec1329a5
5
pyvenv.cfg
Normal file
5
pyvenv.cfg
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
home = /usr/bin
|
||||||
|
include-system-site-packages = false
|
||||||
|
version = 3.12.7
|
||||||
|
executable = /usr/bin/python3.12
|
||||||
|
command = /usr/bin/python -m venv /home/fred.boniface/git/owlboard/dgp2
|
16
src/main.py
16
src/main.py
@@ -1,6 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import glob
|
import glob
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
import parse_pdf
|
import parse_pdf
|
||||||
|
|
||||||
# List all PDF files in the given directory
|
# List all PDF files in the given directory
|
||||||
@@ -12,7 +14,7 @@ def list_pdf_files(directory):
|
|||||||
def get_schedule_card_data(filepath):
|
def get_schedule_card_data(filepath):
|
||||||
filename = os.path.basename(filepath)
|
filename = os.path.basename(filepath)
|
||||||
date_str = filename[:4]
|
date_str = filename[:4]
|
||||||
date_object = datetime.now()
|
date_object = datetime.datetime.now()
|
||||||
if len(date_str) < 4:
|
if len(date_str) < 4:
|
||||||
raise ValueError("Filename must have at least four characters")
|
raise ValueError("Filename must have at least four characters")
|
||||||
return None
|
return None
|
||||||
@@ -24,8 +26,8 @@ def get_schedule_card_data(filepath):
|
|||||||
month = int(month_str)
|
month = int(month_str)
|
||||||
day = int(day_str)
|
day = int(day_str)
|
||||||
|
|
||||||
year = datetime.now().year
|
year = datetime.datetime.now().year
|
||||||
date_object = datetime(year=year, month=month, day=day)
|
date_object = datetime.datetime(year=year, month=month, day=day)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
print(f"Error parsing date: {e}")
|
print(f"Error parsing date: {e}")
|
||||||
return None
|
return None
|
||||||
@@ -66,6 +68,14 @@ def main():
|
|||||||
else:
|
else:
|
||||||
print(f"Found {len(pdf_files)} PDF files")
|
print(f"Found {len(pdf_files)} PDF files")
|
||||||
|
|
||||||
|
schedule_cards = []
|
||||||
|
for pdf_file in pdf_files:
|
||||||
|
schedule_cards.append(get_schedule_card_data(pdf_file))
|
||||||
|
|
||||||
|
# print(schedule_cards)
|
||||||
|
out = open("output.txt", "w")
|
||||||
|
out.write(json.dumps(schedule_cards, indent=4, default=str))
|
||||||
|
out.close()
|
||||||
|
|
||||||
## For each file in list, run get_schedule_card_data(filepath)
|
## For each file in list, run get_schedule_card_data(filepath)
|
||||||
## if returned value is not None, append to a list. Once
|
## if returned value is not None, append to a list. Once
|
||||||
|
@@ -32,7 +32,7 @@ def parse_pdf_file(filename):
|
|||||||
parsed_schedule_cards = []
|
parsed_schedule_cards = []
|
||||||
|
|
||||||
for schedule_card in schedule_cards:
|
for schedule_card in schedule_cards:
|
||||||
train_sections = re.split(f"(?={train_start_pattern})", schedule_cards[0])
|
train_sections = re.split(f"(?={train_start_pattern})", schedule_card)
|
||||||
train_sections = [section.strip() for section in train_sections if section.strip()]
|
train_sections = [section.strip() for section in train_sections if section.strip()]
|
||||||
|
|
||||||
parsed_schedule_cards.append(train_sections)
|
parsed_schedule_cards.append(train_sections)
|
||||||
|
Loading…
Reference in New Issue
Block a user