Compare commits
2 Commits
table-proc
...
main
Author | SHA1 | Date | |
---|---|---|---|
c7fd19f477 | |||
42c0ae34ab |
@ -1,13 +1,7 @@
|
|||||||
certifi==2024.8.30
|
certifi==2024.8.30
|
||||||
cffi==1.17.1
|
|
||||||
charset-normalizer==3.4.0
|
charset-normalizer==3.4.0
|
||||||
cryptography==44.0.0
|
|
||||||
idna==3.10
|
idna==3.10
|
||||||
pdfminer.six==20231228
|
|
||||||
pdfplumber==0.11.4
|
|
||||||
pillow==11.0.0
|
|
||||||
pycparser==2.22
|
|
||||||
pyOwlBoard==0.0.2
|
pyOwlBoard==0.0.2
|
||||||
pypdfium2==4.30.0
|
PyPDF2==3.0.1
|
||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
urllib3==2.2.3
|
urllib3==2.2.3
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import pdfplumber
|
import PyPDF2
|
||||||
import re
|
import re
|
||||||
|
|
||||||
## re Patterns
|
## re Patterns
|
||||||
@ -11,18 +11,19 @@ train_pis_line_pattern = r"\(PIS Code\s*:\s*(?P<pis_code>\d+|____)\s*\)" # Extra
|
|||||||
# Extract Schedule Cards to list of strings
|
# Extract Schedule Cards to list of strings
|
||||||
|
|
||||||
def extract_pdf_text(file_path):
|
def extract_pdf_text(file_path):
|
||||||
tables = []
|
with open(file_path, 'rb') as file:
|
||||||
try:
|
pdf_reader = PyPDF2.PdfReader(file)
|
||||||
with pdfplumber.open(file_path) as file:
|
|
||||||
for page_number, page in enumerate(file.pages, start=1):
|
page_texts = []
|
||||||
page_tables = page.extract_tables()
|
for page in pdf_reader.pages:
|
||||||
if page_tables:
|
text = page.extract_text()
|
||||||
tables.extend(page_tables)
|
if text:
|
||||||
except Exception as e:
|
page_texts.append(text)
|
||||||
print(f"Error processing PDF: {e}")
|
|
||||||
return tables
|
|
||||||
|
|
||||||
# Does everything - NEEDS REWRITE TO HANDLE EXTRACTED TABLES FROM ABOVE FN
|
full_text = " ".join(page_texts)
|
||||||
|
return full_text
|
||||||
|
|
||||||
|
# Does everything - should be split in to functional functions
|
||||||
def parse_pdf_file(filename):
|
def parse_pdf_file(filename):
|
||||||
pdf_text = extract_pdf_text(filename)
|
pdf_text = extract_pdf_text(filename)
|
||||||
schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
|
schedule_cards = re.findall(schedule_pattern, pdf_text, re.DOTALL)
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
import pdfplumber
|
|
||||||
|
|
||||||
with pdfplumber.open("./../1031 THO SC CNDR.pdf") as pdf:
|
|
||||||
for page_number, page in enumerate(pdf.pages, start=1):
|
|
||||||
tables = page.extract_tables()
|
|
||||||
for table in tables:
|
|
||||||
print(f"Table on page {page_number}: ")
|
|
||||||
for row in table:
|
|
||||||
print(row[0])
|
|
@ -55,7 +55,7 @@ def check_and_validate_against_owlboard(train_entries):
|
|||||||
output.append(train_entry)
|
output.append(train_entry)
|
||||||
|
|
||||||
# Else if only one possible entry (and previous statements false), open issue
|
# Else if only one possible entry (and previous statements false), open issue
|
||||||
elif len(train_entry['timetable_entries']) == 1:
|
elif len(train_entry['timetable_entries']) == 1 and train_entry['diagram_pis_code'] != "____":
|
||||||
issue_title = f"PIS Error | Code: {train_entry['diagram_pis_code']}"
|
issue_title = f"PIS Error | Code: {train_entry['diagram_pis_code']}"
|
||||||
issue_content = f"""
|
issue_content = f"""
|
||||||
PIS Code {train_entry['diagram_pis_code']}.
|
PIS Code {train_entry['diagram_pis_code']}.
|
||||||
|
Loading…
Reference in New Issue
Block a user