Flesh out main.py and reorganise project directories

2024-10-25 21:39:16 +01:00
parent 3c58645eba
commit 49d47b479b
3 changed files with 77 additions and 11 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,77 @@
+import glob
+import os
+import sys
+from datetime import datetime
+
+import parse_pdf
+
+# List all PDF files in the given directory
+def list_pdf_files(directory):
+    """Return the paths matching ``*.pdf`` directly inside *directory*.
+
+    Args:
+        directory: Directory to search (not searched recursively).
+
+    Returns:
+        A list of path strings as produced by ``glob.glob``; empty when
+        nothing matches.
+    """
+    search_pattern = os.path.join(directory, '*.pdf')
+    return glob.glob(search_pattern)
+
+# Extract the MMDD date from a PDF's filename and return it together with the PDF's parsed content
+def get_schedule_card_data(filepath):
+    """Build the schedule card record for a single PDF file.
+
+    The first four characters of the file's basename are read as an
+    ``MMDD`` date and combined with the current year.
+
+    Args:
+        filepath: Path to the PDF file.
+
+    Returns:
+        A dict with the keys ``"schedule_date"`` (a ``datetime``) and
+        ``"schedule_data"`` (the result of ``parse_pdf.parse_pdf_file``),
+        or ``None`` when the filename prefix is not a valid month/day.
+
+    Raises:
+        ValueError: If the basename is shorter than four characters.
+    """
+    filename = os.path.basename(filepath)
+    if len(filename) < 4:
+        raise ValueError("Filename must have at least four characters")
+
+    month_str, day_str = filename[:2], filename[2:4]
+
+    try:
+        # int() rejects non-numeric text and datetime() rejects impossible
+        # dates (e.g. month 13); both raise ValueError.
+        date_object = datetime(
+            year=datetime.now().year,
+            month=int(month_str),
+            day=int(day_str),
+        )
+    except ValueError as e:
+        print(f"Error parsing date: {e}")
+        return None
+
+    return {
+        "schedule_date": date_object,
+        "schedule_data": parse_pdf.parse_pdf_file(filepath),
+    }
+
+
+# Loop through data and remove duplicate codes.
+
+# Check for existing codes via OwlBoard API.
+# Validate existing codes and submit issue if not correct.
+
+# Use the train list to search for stopping pattern of any absent codes.
+
+
+# Format file and commit to git
+
+def main():
+    """Locate the PDF files to process and report how many were found.
+
+    The target directory is the first command-line argument when one is
+    given, otherwise the current working directory. An error message is
+    printed and the function returns early when the directory is invalid
+    or contains no PDF files.
+    """
+    # First CLI argument wins; fall back to the working directory
+    directory = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
+
+    if not os.path.isdir(directory):
+        print(f"Error: '{directory}' is not a valid directory")
+        return
+
+    pdf_files = list_pdf_files(directory)
+    if not pdf_files:
+        print(f"Error: '{directory}' contains no PDF files")
+        return
+
+    print(f"Found {len(pdf_files)} PDF files")
+
+    ## TODO: For each file in the list, run get_schedule_card_data(filepath)
+    ## and, if the returned value is not None, append it to a list.  Once
+    ## complete, pass the list to a validation function, then into the Git
+    ## handling function.
+
+    
+# Run main() only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/src/parse_pdf.py
+++ b/src/parse_pdf.py
@@ -0,0 +1,55 @@
+import PyPDF2
+import re
+
+## re Patterns
+# Lazily captures the text that follows each "NOTES" marker, stopping at the
+# next "NOTES" or at the end of the text.  parse_pdf_file applies it with
+# re.DOTALL so that "." also spans newlines.
+schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)" # Split Schedule Cards
+# Matches, in order: a digit/uppercase-letter/two-digit code, a five-digit
+# number, one or two "dd.dd" times, then a run of letters, spaces and "&".
+# parse_pdf_file wraps it in a lookahead to split a card into train sections.
+train_start_pattern = r"\b\d[A-Z]\d{2}\s+\d{5}\s+\d{2}\.\d{2}(?:\s+\d{2}\.\d{2})?\s+[A-Za-z &]+" # Defines start of train section
+# Same field layout as train_start_pattern, but with named groups:
+# headcode, gsmr_code, time0, time1 (optional) and location.
+# NOTE(review): location accepts digits here but not in train_start_pattern -
+# confirm whether the two character classes are meant to differ.
+train_first_line_pattern = r"(?P<headcode>\d[A-Z]\d{2})\s+(?P<gsmr_code>\d{5})\s+(?P<time0>\d{2}\.\d{2})(?:\s+(?P<time1>\d{2}\.\d{2}))?\s+(?P<location>[A-Za-z0-9& ]+)" # Extracts Train Data
+# Matches "(PIS Code : <digits>)" with optional whitespace around the colon and
+# before the closing parenthesis; the digits are captured as pis_code.
+train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+)\s*\)" # Extracts PIS Code
+
+# Extract Schedule Cards to list of strings
+
+def extract_pdf_text(file_path):
+    """Return the text of every page of a PDF, joined by single spaces.
+
+    Args:
+        file_path: Path to the PDF file; it is opened in binary mode.
+
+    Returns:
+        One string containing the extracted text of each page. Pages for
+        which ``extract_text()`` returns nothing are skipped.
+    """
+    with open(file_path, 'rb') as pdf_stream:
+        reader = PyPDF2.PdfReader(pdf_stream)
+        # Extraction happens while the file is still open
+        extracted = (page.extract_text() for page in reader.pages)
+        return " ".join(text for text in extracted if text)
+
+# Does everything - should be split into smaller single-purpose functions
+def parse_pdf_file(filename):
+    """Parse a schedule PDF into a flat list of train records.
+
+    The PDF text is split into schedule cards with ``schedule_pattern``,
+    each card is split into per-train sections at every position where
+    ``train_start_pattern`` matches, and the fields of each section are
+    read with ``train_first_line_pattern`` and ``train_pis_line_pattern``.
+    The number of schedule cards found is printed.
+
+    Args:
+        filename: Path to the PDF file to parse.
+
+    Returns:
+        A list of dicts, one per train section whose start matches
+        ``train_first_line_pattern``, in document order. Each dict has
+        the keys ``headcode``, ``gsmr_code``, ``time0``, ``time1``,
+        ``location`` and ``pis_code``; ``time1`` and ``pis_code`` are
+        ``None`` when absent. Sections that do not match are skipped.
+    """
+    pdf_text = extract_pdf_text(filename)
+    schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
+    print(f"{len(schedule_cards)} Schedule Cards parsed")
+
+    # Zero-width lookahead: split *before* each train start so the first
+    # line stays attached to its own section.
+    train_splitter = re.compile(f"(?={train_start_pattern})")
+
+    train_data = []
+    for schedule_card in schedule_cards:
+        # Split the current card; indexing schedule_cards[0] here would
+        # re-parse the first card once per card and drop all the others.
+        for section in train_splitter.split(schedule_card):
+            train = section.strip()
+            if not train:
+                continue
+
+            first_line_match = re.match(train_first_line_pattern, train)
+            if not first_line_match:
+                continue
+
+            train_entry = first_line_match.groupdict()
+            pis_code_match = re.search(train_pis_line_pattern, train)
+            train_entry["pis_code"] = (
+                pis_code_match.group("pis_code") if pis_code_match else None
+            )
+            train_data.append(train_entry)
+
+    return train_data