# NOTE(review): unified diff for src/parse_pdf.py, restored to one record per line.
# The hunk changes train_pis_line_pattern so the PIS code group accepts either
# a run of digits or the literal "____", and adds one blank line after it.
# The "(?P" groups below carry no "<name>" part, which Python's re module rejects;
# the names were presumably lost in extraction -- restore them from the real file
# before applying this patch.
# The two blank context lines before the final comment are inferred from the hunk
# header counts (-5,7 +5,8) -- TODO confirm against the original patch.
diff --git a/src/parse_pdf.py b/src/parse_pdf.py
index b52d335..9c53624 100644
--- a/src/parse_pdf.py
+++ b/src/parse_pdf.py
@@ -5,7 +5,8 @@ import re
 schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)" # Split Schedule Cards
 train_start_pattern = r"\b\d[A-Z]\d{2}\s+\d{5}\s+\d{2}\.\d{2}(?:\s+\d{2}\.\d{2})?\s+[A-Za-z &]+" # Defines start of train section
 train_first_line_pattern = r"(?P\d[A-Z]\d{2})\s+(?P\d{5})\s+(?P\d{2}\.\d{2})(?:\s+(?P\d{2}\.\d{2}))?\s+(?P[A-Za-z0-9& ]+)" # Extracts Train Data
-train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P\d+)\s*\)" # Extracts PIS Code
+train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P\d+|____)\s*\)" # Extracts PIS Code
+
 
 
 # Extract Schedule Cards to list of strings