import re

import PyPDF2

## re Patterns

# Split Schedule Cards: captures the text that follows each "NOTES" marker,
# up to (not including) the next "NOTES" marker or the end of the text.
# Intended to be used with re.DOTALL so that a card may span several lines.
schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)"

# Defines start of train section
train_start_pattern = r"\b\d[A-Z]\d{2}\s+\d{5}\s+\d{2}\.\d{2}(?:\s+\d{2}\.\d{2})?\s+[A-Za-z &]+"

# Extracts Train Data
# NOTE(review): the group names in this pattern were missing from the source
# (it read "(?P\d..." which `re` rejects as invalid). The names below are
# neutral placeholders describing only the shape of each field -- confirm
# them against whatever consumes the dictionaries returned by parse_pdf_file.
train_first_line_pattern = (
    r"(?P<train_code>\d[A-Z]\d{2})\s+"
    r"(?P<train_number>\d{5})\s+"
    r"(?P<first_time>\d{2}\.\d{2})"
    r"(?:\s+(?P<second_time>\d{2}\.\d{2}))?\s+"
    r"(?P<description>[A-Za-z0-9& ]+)"
)

# Extracts PIS Code (group name matches the .group("pis_code") lookup below)
train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+)\s*\)"


# Extract the text of every page of a PDF as one string
def extract_pdf_text(file_path):
    """Return the text of all pages of the PDF at *file_path*, space-joined.

    Pages for which ``extract_text()`` returns nothing (empty string or
    ``None``) are skipped.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A single string containing the text of each non-empty page,
        separated by single spaces. Empty string if no page yields text.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open.
        extracted = (page.extract_text() for page in pdf_reader.pages)
        page_texts = [text for text in extracted if text]
    return " ".join(page_texts)


def _split_train_sections(schedule_card):
    """Split one schedule card into stripped, non-empty per-train sections."""
    # The zero-width lookahead splits *before* each train header, so the
    # header text stays at the start of its own section.
    sections = re.split(f"(?={train_start_pattern})", schedule_card)
    return [section.strip() for section in sections if section.strip()]


def _parse_train_section(section):
    """Return the field dict for one train section, or None if it has no header."""
    first_line_match = re.match(train_first_line_pattern, section)
    if first_line_match is None:
        # e.g. the preamble of a card that precedes the first train header.
        return None
    train_entry = first_line_match.groupdict()
    pis_code_match = re.search(train_pis_line_pattern, section)
    train_entry["pis_code"] = (
        pis_code_match.group("pis_code") if pis_code_match else None
    )
    return train_entry


def parse_pdf_file(filename):
    """Parse the schedule cards in a PDF into a flat list of train records.

    The PDF text is split into schedule cards on "NOTES" markers, each card
    is split into train sections at every train header, and each section
    that starts with a train header is turned into a dictionary.

    Prints the number of schedule cards found to stdout.

    Args:
        filename: Path of the PDF file to parse.

    Returns:
        A list of dictionaries, one per recognised train, in document order.
        Each holds the named groups of ``train_first_line_pattern`` (an
        optional group that did not match is ``None``) plus ``"pis_code"``,
        which is the digits of the first "(PIS Code : N)" found in the
        section or ``None`` when there is none.
    """
    pdf_text = extract_pdf_text(filename)
    schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
    print(f"{len(schedule_cards)} Schedule Cards parsed")

    train_data = []
    for schedule_card in schedule_cards:
        # Split the card being iterated (the original always re-split
        # schedule_cards[0], so every card repeated the first card's trains).
        for section in _split_train_sections(schedule_card):
            train_entry = _parse_train_section(section)
            if train_entry is not None:
                train_data.append(train_entry)
    return train_data