From 8af14df670ed434e411246e091ec4a9eb263afec Mon Sep 17 00:00:00 2001
From: Fred Boniface
Date: Fri, 6 Dec 2024 11:58:03 +0000
Subject: [PATCH] Open SC files and parse tables

---
 requirements.txt |  8 +++++++-
 src/parse_pdf.py | 25 ++++++++++++-------------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 54d8967..3542966 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,13 @@
 certifi==2024.8.30
+cffi==1.17.1
 charset-normalizer==3.4.0
+cryptography==44.0.0
 idna==3.10
+pdfminer.six==20231228
+pdfplumber==0.11.4
+pillow==11.0.0
+pycparser==2.22
 pyOwlBoard==0.0.2
-PyPDF2==3.0.1
+pypdfium2==4.30.0
 requests==2.32.3
 urllib3==2.2.3
diff --git a/src/parse_pdf.py b/src/parse_pdf.py
index 9c53624..ed01a0d 100644
--- a/src/parse_pdf.py
+++ b/src/parse_pdf.py
@@ -1,4 +1,4 @@
-import PyPDF2
+import pdfplumber
 import re
 
 ## re Patterns
@@ -11,19 +11,18 @@ train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P\d+|____)\s*\)" # Extra
 
 # Extract Schedule Cards to list of strings
 def extract_pdf_text(file_path):
-    with open(file_path, 'rb') as file:
-        pdf_reader = PyPDF2.PdfReader(file)
-
-        page_texts = []
-        for page in pdf_reader.pages:
-            text = page.extract_text()
-            if text:
-                page_texts.append(text)
+    tables = []
+    try:
+        with pdfplumber.open(file_path) as file:
+            for page_number, page in enumerate(file.pages, start=1):
+                page_tables = page.extract_tables()
+                if page_tables:
+                    tables.extend(page_tables)
+    except Exception as e:
+        print(f"Error processing PDF: {e}")
+    return tables
 
-        full_text = " ".join(page_texts)
-    return full_text
-
-# Does everything - should be split in to functional functions
+# Does everything - NEEDS REWRITE TO HANDLE EXTRACTED TABLES FROM ABOVE FN
 def parse_pdf_file(filename):
     pdf_text = extract_pdf_text(filename)
     schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)