Flesh out main.py and reorganise project directories
This commit is contained in:
77
src/main.py
Normal file
77
src/main.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import glob
import os
import sys
from datetime import datetime

import parse_pdf
|
||||
|
||||
# List all PDF files in the given directory
def list_pdf_files(directory):
    """Return paths of every file matching ``*.pdf`` directly inside *directory*.

    Non-recursive; returns an empty list when the directory has no PDFs
    (or does not exist, since glob simply matches nothing).
    """
    pattern = os.path.join(directory, '*.pdf')
    return glob.glob(pattern)
|
||||
|
||||
# Extracts date from PDF filename and returns its content
def get_schedule_card_data(filepath):
    """Build a schedule-card record from a PDF whose filename starts with MMDD.

    The first four characters of the basename are read as a two-digit month
    and two-digit day; the year is assumed to be the current one because the
    filenames carry none.

    Returns a dict with keys "schedule_date" (datetime) and "schedule_data"
    (the result of parse_pdf.parse_pdf_file), or None when the filename does
    not begin with a parseable MMDD prefix.
    """
    filename = os.path.basename(filepath)
    date_str = filename[:4]

    # Validate before parsing: a shorter name cannot hold an MMDD prefix.
    # (The original raised here but then fell through to an unreachable
    # `return None`; returning None matches the except branch below and the
    # caller's planned "skip None results" handling.)
    if len(date_str) < 4:
        print(f"Error parsing date: filename '{filename}' is shorter than four characters")
        return None

    try:
        month = int(date_str[:2])
        day = int(date_str[2:])
        # Schedule cards carry no year; assume the current one.
        date_object = datetime(year=datetime.now().year, month=month, day=day)
    except ValueError as e:
        # Non-numeric prefix or an impossible month/day combination.
        print(f"Error parsing date: {e}")
        return None

    return {
        "schedule_date": date_object,
        "schedule_data": parse_pdf.parse_pdf_file(filepath),
    }
|
||||
|
||||
|
||||
# Loop through data and remove duplicate codes.
|
||||
|
||||
# Check for existing codes via OwlBoard API.
|
||||
# Validate existing codes and submit issue if not correct.
|
||||
|
||||
# Use the train list to search for stopping pattern of any absent codes.
|
||||
|
||||
|
||||
# Format file and commit to git
|
||||
|
||||
def main():
    """Entry point: locate PDF schedule cards in the target directory."""
    # Directory comes from the first CLI argument, defaulting to the CWD.
    directory = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()

    if not os.path.isdir(directory):
        print(f"Error: '{directory}' is not a valid directory")
        return

    pdf_files = list_pdf_files(directory)
    if not pdf_files:
        print(f"Error: '{directory}' contains no PDF files")
        return
    print(f"Found {len(pdf_files)} PDF files")

    ## For each file in list, run get_schedule_card_data(filepath)
    ## if returned value is not None, append to a list. Once
    ## complete, pass to a validation function, then into Git handling
    ## function.


if __name__ == "__main__":
    main()
|
||||
55
src/parse_pdf.py
Normal file
55
src/parse_pdf.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import PyPDF2
|
||||
import re
|
||||
|
||||
## re Patterns
# Everything between one "NOTES" marker and the next (or end of text);
# each matched span is one schedule card.
schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)" # Split Schedule Cards
# First line of a train section: headcode (e.g. 1A23), 5-digit GSM-R code,
# one or two HH.MM times, then a location name.
train_start_pattern = r"\b\d[A-Z]\d{2}\s+\d{5}\s+\d{2}\.\d{2}(?:\s+\d{2}\.\d{2})?\s+[A-Za-z &]+" # Defines start of train section
# Same shape as train_start_pattern, with named groups for extraction.
train_first_line_pattern = r"(?P<headcode>\d[A-Z]\d{2})\s+(?P<gsmr_code>\d{5})\s+(?P<time0>\d{2}\.\d{2})(?:\s+(?P<time1>\d{2}\.\d{2}))?\s+(?P<location>[A-Za-z0-9& ]+)" # Extracts Train Data
# Matches "(PIS Code : 12345)" and captures the digits as "pis_code".
train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+)\s*\)" # Extracts PIS Code
|
||||
|
||||
# Extract the full text of a PDF as a single string
def extract_pdf_text(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Pages whose extraction yields nothing are skipped; the remaining page
    texts are joined with a single space.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Keep only pages whose extraction produced text.
        page_texts = [text for text in
                      (page.extract_text() for page in reader.pages)
                      if text]
        return " ".join(page_texts)
|
||||
|
||||
# Does everything - should be split into functional functions
def parse_pdf_file(filename):
    """Parse the schedule-card PDF at *filename* into per-train records.

    Returns a list of dicts, one per train, with keys "headcode",
    "gsmr_code", "time0", "time1", "location" and "pis_code" ("time1" and
    "pis_code" are None when absent from the source text).
    """
    pdf_text = extract_pdf_text(filename)
    schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
    print(f"{len(schedule_cards)} Schedule Cards parsed")

    # Split each card into individual train sections.
    parsed_schedule_cards = []
    for schedule_card in schedule_cards:
        # BUG FIX: the original split schedule_cards[0] on every iteration,
        # so every card after the first was silently ignored.
        train_sections = re.split(f"(?={train_start_pattern})", schedule_card)
        train_sections = [section.strip() for section in train_sections if section.strip()]
        parsed_schedule_cards.append(train_sections)

    # Extract structured data from each train section.
    train_data = []
    for train_sections in parsed_schedule_cards:
        for train in train_sections:
            first_line_match = re.match(train_first_line_pattern, train)
            if not first_line_match:
                # Section does not start with a recognisable train line; skip.
                continue
            train_entry = first_line_match.groupdict()
            pis_code_match = re.search(train_pis_line_pattern, train)
            train_entry["pis_code"] = pis_code_match.group("pis_code") if pis_code_match else None
            train_data.append(train_entry)

    return train_data
|
||||
Reference in New Issue
Block a user