Open SC files and parse tables
This commit is contained in:
parent
aca3f94396
commit
8af14df670
@ -1,7 +1,13 @@
|
||||
certifi==2024.8.30
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.0
|
||||
cryptography==44.0.0
|
||||
idna==3.10
|
||||
pdfminer.six==20231228
|
||||
pdfplumber==0.11.4
|
||||
pillow==11.0.0
|
||||
pycparser==2.22
|
||||
pyOwlBoard==0.0.2
|
||||
PyPDF2==3.0.1
|
||||
pypdfium2==4.30.0
|
||||
requests==2.32.3
|
||||
urllib3==2.2.3
|
||||
|
@ -1,4 +1,4 @@
|
||||
import PyPDF2
|
||||
import pdfplumber
|
||||
import re
|
||||
|
||||
## re Patterns
|
||||
@ -11,19 +11,18 @@ train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+|____)\s*\)" # Extra
|
||||
# Extract Schedule Cards to list of strings
|
||||
|
||||
def extract_pdf_text(file_path):
|
||||
with open(file_path, 'rb') as file:
|
||||
pdf_reader = PyPDF2.PdfReader(file)
|
||||
|
||||
page_texts = []
|
||||
for page in pdf_reader.pages:
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
page_texts.append(text)
|
||||
tables = []
|
||||
try:
|
||||
with pdfplumber.open(file_path) as file:
|
||||
for page_number, page in enumerate(file.pages, start=1):
|
||||
page_tables = page.extract_tables()
|
||||
if page_tables:
|
||||
tables.extend(page_tables)
|
||||
except Exception as e:
|
||||
print(f"Error processing PDF: {e}")
|
||||
return tables
|
||||
|
||||
full_text = " ".join(page_texts)
|
||||
return full_text
|
||||
|
||||
# Does everything - should be split into separate functions
|
||||
# Does everything - NEEDS REWRITE TO HANDLE EXTRACTED TABLES FROM THE ABOVE FUNCTION
|
||||
def parse_pdf_file(filename):
|
||||
pdf_text = extract_pdf_text(filename)
|
||||
schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
|
||||
|
Loading…
Reference in New Issue
Block a user