Flesh out main.py and reorganise project directories

This commit is contained in:
Fred Boniface
2024-10-25 21:39:16 +01:00
parent 3c58645eba
commit 49d47b479b
3 changed files with 77 additions and 11 deletions

77
src/main.py Normal file
View File

@@ -0,0 +1,77 @@
import glob
import os
import sys
from datetime import datetime

import parse_pdf
# Collect the PDF files that sit directly inside a directory
def list_pdf_files(directory):
    """Return the paths of all ``*.pdf`` files directly inside *directory*.

    The search does not descend into sub-directories. The order of the
    returned list is whatever ``glob.glob`` yields.
    """
    pattern = os.path.join(directory, '*.pdf')
    return glob.glob(pattern)
# Build a schedule-card record from a PDF whose name starts with MMDD
def get_schedule_card_data(filepath):
    """Read the date from a PDF's filename and parse the PDF's contents.

    The first four characters of the file's basename are read as ``MMDD``
    (two-digit month followed by two-digit day). The filename carries no
    year, so the current year is used.

    Args:
        filepath: Path to the PDF file.

    Returns:
        A dict with the keys ``"schedule_date"`` (a ``datetime``) and
        ``"schedule_data"`` (the value returned by
        ``parse_pdf.parse_pdf_file``), or ``None`` when no valid date can be
        read from the filename. A message is printed in the ``None`` case.
    """
    filename = os.path.basename(filepath)
    date_str = filename[:4]

    # Every bad-filename case reports the problem and returns None, so a
    # caller looping over many files can skip the bad ones and carry on.
    if len(date_str) < 4:
        print(f"Error parsing date: filename '{filename}' must have at least four characters")
        return None

    try:
        # int() rejects non-numeric text and datetime() rejects impossible
        # dates (month 13, 31 February, ...); both raise ValueError.
        schedule_date = datetime(
            year=datetime.now().year,
            month=int(date_str[:2]),
            day=int(date_str[2:]),
        )
    except ValueError as e:
        print(f"Error parsing date: {e}")
        return None

    return {
        "schedule_date": schedule_date,
        "schedule_data": parse_pdf.parse_pdf_file(filepath),
    }
# Loop through data and remove duplicate codes.
# Check for existing codes via OwlBoard API.
# Validate existing codes and submit issue if not correct.
# Use the train list to search for stopping pattern of any absent codes.
# Format file and commit to git
def main():
    """Locate the PDF files to process and report how many were found.

    The directory to scan is the first command-line argument, or the current
    working directory when no argument is given.

    Returns:
        0 when at least one PDF file was found; 1 when the directory is not
        valid or contains no PDF files (the reason is printed to stderr).
    """
    # Check for arguments
    directory = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()

    if not os.path.isdir(directory):
        print(f"Error: '{directory}' is not a valid directory", file=sys.stderr)
        return 1

    pdf_files = list_pdf_files(directory)
    if not pdf_files:
        print(f"Error: '{directory}' contains no PDF files", file=sys.stderr)
        return 1
    print(f"Found {len(pdf_files)} PDF files")

    # TODO: For each file in the list, run get_schedule_card_data(filepath);
    # if the returned value is not None, append it to a list. Once complete,
    # pass the list to a validation function, then into a Git handling
    # function.
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so failures are visible to the calling shell.
    sys.exit(main())

55
src/parse_pdf.py Normal file
View File

@@ -0,0 +1,55 @@
import PyPDF2
import re
## re Patterns
# These are plain pattern strings; they are passed to the `re` functions
# inside parse_pdf_file.

# Captures the text that follows a literal "NOTES" marker, up to (but not
# including) the next "NOTES" or the end of the input. parse_pdf_file applies
# it with re.DOTALL so that "." also matches newlines.
schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)" # Split Schedule Cards
# Matches, in order: a token shaped digit + capital letter + two digits, a
# five-digit number, one or two dd.dd values, then a run of letters, spaces
# and "&". parse_pdf_file wraps it in a lookahead to split a card into trains.
train_start_pattern = r"\b\d[A-Z]\d{2}\s+\d{5}\s+\d{2}\.\d{2}(?:\s+\d{2}\.\d{2})?\s+[A-Za-z &]+" # Defines start of train section
# Same layout as train_start_pattern but with named groups: headcode,
# gsmr_code, time0, time1 (optional, None when absent) and location. Note that
# the location class here also accepts digits, unlike train_start_pattern.
# NOTE(review): what time0 and time1 each represent is not shown here - confirm.
train_first_line_pattern = r"(?P<headcode>\d[A-Z]\d{2})\s+(?P<gsmr_code>\d{5})\s+(?P<time0>\d{2}\.\d{2})(?:\s+(?P<time1>\d{2}\.\d{2}))?\s+(?P<location>[A-Za-z0-9& ]+)" # Extracts Train Data
# Matches "(PIS Code : <digits>)" with optional whitespace around the colon
# and before the closing bracket; the digits are captured as pis_code.
train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+)\s*\)" # Extracts PIS Code
# Read a PDF and return the text of all its pages as one string
def extract_pdf_text(file_path):
    """Return the text of every page in the PDF, joined by single spaces.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are left out of the result.
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open; the reader pulls from it.
        extracted = [page.extract_text() for page in reader.pages]
        return " ".join(text for text in extracted if text)
# Turn one train section into a dict of its fields (None if it is not a train)
def _parse_train_section(section):
    """Parse a single train section of a schedule card.

    Args:
        section: Text of one section, expected to begin with the train's
            first line.

    Returns:
        The named groups of ``train_first_line_pattern`` as a dict, plus a
        ``"pis_code"`` key (``None`` when the section has no PIS code), or
        ``None`` when the section does not begin with a train first line.
    """
    first_line_match = re.match(train_first_line_pattern, section)
    if first_line_match is None:
        # Text ahead of the first train on a card, or an empty fragment.
        return None
    train_entry = first_line_match.groupdict()
    pis_code_match = re.search(train_pis_line_pattern, section)
    train_entry["pis_code"] = pis_code_match.group("pis_code") if pis_code_match else None
    return train_entry


# Parse one schedule PDF into a flat list of per-train dicts
def parse_pdf_file(filename):
    """Extract the train details from every schedule card in a PDF.

    The PDF text is split into schedule cards with ``schedule_pattern``, each
    card is split into train sections at every ``train_start_pattern`` match,
    and each section is parsed into a dict. The number of cards found is
    printed.

    Args:
        filename: Path to the PDF file.

    Returns:
        A list with one dict per recognised train, across all cards in the
        file. Each dict holds the named groups of
        ``train_first_line_pattern`` plus ``"pis_code"``.
    """
    pdf_text = extract_pdf_text(filename)
    schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
    print(f"{len(schedule_cards)} Schedule Cards parsed")

    train_data = []
    for schedule_card in schedule_cards:
        # Split the card being iterated, not always the first one. The
        # zero-width lookahead keeps each train's first line at the start of
        # its own section (splitting on empty matches needs Python 3.7+).
        for section in re.split(f"(?={train_start_pattern})", schedule_card):
            train_entry = _parse_train_section(section.strip())
            if train_entry is not None:
                train_data.append(train_entry)
    return train_data