72 lines
2.1 KiB
Python
72 lines
2.1 KiB
Python
|
### This uses the 'python-docx-2023' module
|
||
|
from docx import Document
|
||
|
import re
|
||
|
|
||
|
### This can parse each table. What needs to happen next
### is to parse all tables, then check for a PIS code.
### If a PIS code exists, find the associated headcode.
### An API request can then be made to OwlBoard to try
### and find a service with a valid stopping pattern,
### and the PIS codes can then be generated for review.
|
||
|
|
||
|
# Matches the literal text "PIS code", a colon with optional surrounding
# whitespace, and captures the four digits that follow as group 1.
PIS_PATTERN = re.compile(r'PIS code\s*:\s*(\d{4})')
|
||
|
# Matches a headcode-shaped token: one digit, one uppercase ASCII letter,
# then two digits (e.g. "1A23"). The pattern is unanchored, so it can match
# anywhere inside a longer string.
HEADCODE_PATTERN = re.compile(r'(\d{1}[A-Z]\d{2})')
|
||
|
|
||
|
def extract_tables(file_path):
    """Collect PIS code / headcode matches from every table in a .docx file.

    The first row of each table supplies the column keys; every later row
    becomes a dict mapping those keys to the row's cell text. The rows of
    one table are handed to ``match_pis_and_headcode`` and any truthy
    result is tagged with the originating file and collected.

    Args:
        file_path: Path passed to ``Document`` and stored on each result
            under the ``'source_file'`` key.

    Returns:
        A list of the dicts returned by ``match_pis_and_headcode``, one per
        table that produced a match, each with ``'source_file'`` added.
    """
    doc = Document(file_path)
    tables = doc.tables
    print(f"Reading {len(tables)} tables from {file_path}")

    results = []
    for tbl in tables:
        remaining_rows = iter(tbl.rows)

        # The header row is consumed here so the comprehension below only
        # sees the data rows.
        header_row = next(remaining_rows, None)
        if header_row is None:
            header = ()
        else:
            header = tuple(cell.text for cell in header_row.cells)

        records = [
            dict(zip(header, (cell.text for cell in row.cells)))
            for row in remaining_rows
        ]

        found = match_pis_and_headcode(records)
        if not found:
            continue
        found['source_file'] = file_path
        results.append(found)

    return results
|
||
|
|
||
|
|
||
|
def match_pis_and_headcode(table_data):
    """Find the first PIS code in a table and pair it with a headcode.

    Cell values are scanned row by row for ``PIS_PATTERN``. If one is found,
    the column keys are then scanned for ``HEADCODE_PATTERN``.

    Args:
        table_data: Iterable of dicts mapping column key text to cell text
            (one dict per table row).

    Returns:
        ``{'job_head': ..., 'headcode': ..., 'pis': ...}`` where ``job_head``
        is the stripped key of the column whose value held the PIS code,
        ``headcode`` is the first headcode found in any key and ``pis`` is
        the four-digit code; ``None`` when either part is missing.
    """
    # Lazily walk every (column key, cell text) pair in row order, so the
    # search stops at the first PIS hit.
    cells = (
        (column, text)
        for row in table_data
        for column, text in row.items()
    )
    pis_candidates = ((PIS_PATTERN.search(text), column) for column, text in cells)
    pis_match, pis_column = next(
        (pair for pair in pis_candidates if pair[0]),
        (None, None),
    )
    if pis_match is None:
        return None

    pis_code = pis_match.group(1)
    job_head = pis_column.strip()

    # Headcodes are looked for in the column keys, not the cell values.
    columns = (column for row in table_data for column in row)
    headcode_match = next(
        filter(None, map(HEADCODE_PATTERN.search, columns)),
        None,
    )
    if headcode_match is None:
        return None

    return {
        'job_head': job_head,
        'headcode': headcode_match.group(),
        'pis': pis_code,
    }
|
||
|
|
||
|
def solo_run():
    """Run the extractor on ``./file.docx`` and print the resulting list."""
    results = extract_tables("./file.docx")
    print(results)
|
||
|
|
||
|
# Only run the sample extraction when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    solo_run()
|