From d9abf9f19e9e93a0e27b11c284872a57014a81dd Mon Sep 17 00:00:00 2001
From: Fred Boniface
Date: Sun, 1 Dec 2024 12:32:02 +0000
Subject: [PATCH] Properly handle blank PIS codes and prevent matching next service incorrectly

---
 src/parse_pdf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/parse_pdf.py b/src/parse_pdf.py
index b52d335..9c53624 100644
--- a/src/parse_pdf.py
+++ b/src/parse_pdf.py
@@ -5,7 +5,8 @@ import re
 schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)" # Split Schedule Cards
 train_start_pattern = r"\b\d[A-Z]\d{2}\s+\d{5}\s+\d{2}\.\d{2}(?:\s+\d{2}\.\d{2})?\s+[A-Za-z &]+" # Defines start of train section
 train_first_line_pattern = r"(?P\d[A-Z]\d{2})\s+(?P\d{5})\s+(?P\d{2}\.\d{2})(?:\s+(?P\d{2}\.\d{2}))?\s+(?P[A-Za-z0-9& ]+)" # Extracts Train Data
-train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P\d+)\s*\)" # Extracts PIS Code
+train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P\d+|____)\s*\)" # Extracts PIS Code
+
 
 
 # Extract Schedule Cards to list of strings