55 lines
2.1 KiB
Python
55 lines
2.1 KiB
Python
import PyPDF2
|
|
import re
|
|
|
|
## Regular-expression patterns (kept as plain strings so they can be embedded
## in larger expressions)

# One schedule card: the text following a "NOTES" marker, up to the next
# "NOTES" marker or the end of the text.
schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)"

# Marks where a train section starts inside a schedule card.
train_start_pattern = (
    r"\b\d[A-Z]\d{2}"          # digit, capital letter, two digits
    r"\s+\d{5}"                # five-digit code
    r"\s+\d{2}\.\d{2}"         # first time, written NN.NN
    r"(?:\s+\d{2}\.\d{2})?"    # optional second time
    r"\s+[A-Za-z &]+"          # location text
)

# Same layout as train_start_pattern, with named groups for each field.
train_first_line_pattern = (
    r"(?P<headcode>\d[A-Z]\d{2})"
    r"\s+(?P<gsmr_code>\d{5})"
    r"\s+(?P<time0>\d{2}\.\d{2})"
    r"(?:\s+(?P<time1>\d{2}\.\d{2}))?"
    r"\s+(?P<location>[A-Za-z0-9& ]+)"
)

# "(PIS Code : <digits>)" anywhere in a train section.
train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+)\s*\)"
|
|
|
|
# Extract the PDF's text as one string; parse_pdf_file() splits it into schedule cards
|
|
|
|
def extract_pdf_text(file_path):
    """Return the text of every page of the PDF at *file_path* as one string.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped;
    the remaining page texts are joined with a single space.
    """
    with open(file_path, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        # The generator is consumed by join() below, i.e. while the file is
        # still open.
        extracted = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in extracted if chunk)
|
|
|
|
# Parsing is split into small helpers; parse_pdf_file() ties them together.


def _split_into_trains(schedule_card):
    """Split one schedule card's text into stripped, non-empty train sections."""
    # The lookahead makes the split zero-width, so each section keeps the
    # text that marks its start.
    sections = re.split(f"(?={train_start_pattern})", schedule_card)
    return [section.strip() for section in sections if section.strip()]


def _parse_train(train_text):
    """Return one train section's fields as a dict, or None if it does not start with a train line."""
    first_line_match = re.match(train_first_line_pattern, train_text)
    if first_line_match is None:
        return None

    train_entry = first_line_match.groupdict()
    pis_code_match = re.search(train_pis_line_pattern, train_text)
    train_entry["pis_code"] = (
        pis_code_match.group("pis_code") if pis_code_match else None
    )
    return train_entry


def parse_pdf_file(filename):
    """Parse the trains listed on every schedule card of a PDF.

    The PDF text is cut into schedule cards with ``schedule_pattern``, each
    card is cut into train sections, and every section that begins with a
    train line is turned into a dict.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A list of dicts, one per recognised train, in document order. Each
        dict holds the named groups of ``train_first_line_pattern``
        (``headcode``, ``gsmr_code``, ``time0``, ``time1``, ``location``;
        ``time1`` is ``None`` when absent) plus ``pis_code`` (the digits as a
        string, or ``None`` when the section has no PIS code).

    Side effects:
        Prints the number of schedule cards found.
    """
    pdf_text = extract_pdf_text(filename)
    schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
    print(f"{len(schedule_cards)} Schedule Cards parsed")

    train_data = []
    for schedule_card in schedule_cards:
        # Split the card being iterated, not schedule_cards[0]: indexing the
        # first card here would repeat its trains once per card and ignore
        # every other card.
        for train_text in _split_into_trains(schedule_card):
            train_entry = _parse_train(train_text)
            if train_entry is not None:
                train_data.append(train_entry)

    return train_data