From 3c58645ebaa8a069c9c76d0d353e07ac2ab51904 Mon Sep 17 00:00:00 2001 From: Fred Boniface Date: Fri, 25 Oct 2024 21:15:56 +0100 Subject: [PATCH] init --- .gitignore | 165 +++++++++++++++++++++++++++++++++++++++++++++++ README.md | 3 + main.py | 11 ++++ parse_pdf.py | 55 ++++++++++++++++ requirements.txt | 1 + 5 files changed, 235 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 main.py create mode 100644 parse_pdf.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3b9b3bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,165 @@ +*.pdf +output + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f294f6e --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# dgp2 + +Parses PDF schedule cards, extracting PIS codes and headcodes before using the OwlBoard API to build a list of PIS codes. Submitting the completed list to Git. \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..a6d8892 --- /dev/null +++ b/main.py @@ -0,0 +1,11 @@ +# Open all PDF files in the working directory. + +# Loop through data and remove duplicate codes. + +# Check for existing codes via OwlBoard API. +# Validate existing codes and submit issue if not correct. + +# Use the train list to search for stopping pattern of any absent codes. 
# Format file and commit to git

# ---- parse_pdf.py ----------------------------------------------------------
import re

## re Patterns
# NOTE(review): every ``(?P<name>...)`` group arrived here with its ``<name>``
# part stripped (``(?P\d...``), which ``re`` rejects when the pattern is
# compiled.  ``pis_code`` is confirmed by the ``.group("pis_code")`` lookup
# below and ``headcode`` is suggested by the README; ``number``,
# ``first_time``, ``second_time`` and ``location`` are placeholder names -
# TODO confirm them against the original file before relying on the keys of
# the returned dictionaries.
schedule_pattern = r"(?<=NOTES)(.*?)(?=NOTES|$)"  # Split Schedule Cards
train_start_pattern = (
    r"\b\d[A-Z]\d{2}\s+\d{5}\s+\d{2}\.\d{2}(?:\s+\d{2}\.\d{2})?\s+[A-Za-z &]+"
)  # Defines start of train section
train_first_line_pattern = (
    r"(?P<headcode>\d[A-Z]\d{2})\s+"
    r"(?P<number>\d{5})\s+"
    r"(?P<first_time>\d{2}\.\d{2})"
    r"(?:\s+(?P<second_time>\d{2}\.\d{2}))?\s+"
    r"(?P<location>[A-Za-z0-9& ]+)"
)  # Extracts Train Data
train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+)\s*\)"  # Extracts PIS Code


def extract_pdf_text(file_path):
    """Return the text of every page of the PDF at *file_path* as one string.

    Pages whose ``extract_text()`` result is empty are skipped; the remaining
    page texts are joined with a single space.
    """
    # Imported here rather than at module level so the text-parsing functions
    # below can be imported and used without the PDF backend installed.
    import PyPDF2

    with open(file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open; the reader reads from it.
        page_texts = [page.extract_text() for page in pdf_reader.pages]

    return " ".join(text for text in page_texts if text)


def split_schedule_cards(pdf_text):
    """Return the chunks of *pdf_text* that follow each ``NOTES`` marker.

    Text before the first ``NOTES`` is not part of any chunk.
    """
    return re.findall(schedule_pattern, pdf_text, re.DOTALL)


def split_train_sections(schedule_card):
    """Split one schedule card wherever ``train_start_pattern`` begins.

    Returns the stripped, non-empty pieces.  Any text ahead of the first
    train line is returned as a piece too; it is discarded later because it
    does not match ``train_first_line_pattern``.
    """
    # Zero-width lookahead: split *before* each train line without consuming it.
    pieces = re.split(f"(?={train_start_pattern})", schedule_card)
    return [piece.strip() for piece in pieces if piece.strip()]


def parse_train_section(section):
    """Parse one train section into a dict, or return ``None``.

    The dict holds the named groups of ``train_first_line_pattern`` plus a
    ``"pis_code"`` key, which is ``None`` when the section contains no
    ``(PIS Code: ...)`` marker.  ``None`` is returned when *section* does not
    start with a train line.
    """
    first_line_match = re.match(train_first_line_pattern, section)
    if not first_line_match:
        return None

    train_entry = first_line_match.groupdict()
    pis_code_match = re.search(train_pis_line_pattern, section)
    train_entry["pis_code"] = (
        pis_code_match.group("pis_code") if pis_code_match else None
    )
    return train_entry


def parse_schedule_text(pdf_text):
    """Parse already-extracted schedule text into a list of train dicts.

    Prints the number of schedule cards found, then returns one dict per
    train section, in document order, across all cards.
    """
    schedule_cards = split_schedule_cards(pdf_text)
    print(f"{len(schedule_cards)} Schedule Cards parsed")

    train_data = []
    for schedule_card in schedule_cards:
        # Each card is split on its own text.  The previous code split
        # ``schedule_cards[0]`` here, so every card repeated the first one.
        for section in split_train_sections(schedule_card):
            train_entry = parse_train_section(section)
            if train_entry is not None:
                train_data.append(train_entry)

    return train_data


def parse_pdf_file(filename):
    """Extract the text of the PDF *filename* and return its parsed trains.

    See ``parse_schedule_text`` for the shape of the result.
    """
    return parse_schedule_text(extract_pdf_text(filename))


# ---- requirements.txt ------------------------------------------------------
# PyPDF2==3.0.1