Open SC files and parse tables

2024-12-06 11:58:03 +00:00 · 2024-12-06 11:58:03 +00:00 · 8af14df670
commit 8af14df670
parent aca3f94396
2 changed files with 19 additions and 14 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,13 @@
 certifi==2024.8.30
 cffi==1.17.1
 charset-normalizer==3.4.0
 cryptography==44.0.0
 idna==3.10
 pdfminer.six==20231228
 pdfplumber==0.11.4
 pillow==11.0.0
 pycparser==2.22
 pyOwlBoard==0.0.2
-PyPDF2==3.0.1
+pypdfium2==4.30.0
 requests==2.32.3
 urllib3==2.2.3
--- a/src/parse_pdf.py
+++ b/src/parse_pdf.py
@@ -1,4 +1,4 @@
-import PyPDF2
+import pdfplumber
 import re
 ## re Patterns
@@ -11,19 +11,18 @@ train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+|____)\s*\)" # Extra
 # Extract Schedule Cards to list of strings
 def extract_pdf_text(file_path):
-    with open(file_path, 'rb') as file:
+    tables = []
-        pdf_reader = PyPDF2.PdfReader(file)
+    try:
-        
+        with pdfplumber.open(file_path) as file:
-        page_texts = []
+            for page_number, page in enumerate(file.pages, start=1):
-        for page in pdf_reader.pages:
+                page_tables = page.extract_tables()
-            text = page.extract_text()
+                if page_tables:
-            if text:
+                    tables.extend(page_tables)
-                page_texts.append(text)
+    except Exception as e:
        print(f"Error processing PDF: {e}")
    return tables
-        full_text = " ".join(page_texts)
+# Does everything - NEEDS REWRITE TO HANDLE EXTRACTED TABLES FROM ABOVE FN
        return full_text
 # Does everything - should be split in to functional functions
 def parse_pdf_file(filename):
    pdf_text = extract_pdf_text(filename)
    schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)