diagram-parser/src/parse_docx.py

72 lines
2.1 KiB
Python

### This uses the 'python-docx-2023' module
from docx import Document
import re
### Every table in the document is parsed and checked for a PIS code.
### If a PIS code exists, the associated headcode is looked up as well.
### Next step: make an API request to OwlBoard to try to find a
### service with a valid stopping pattern, so that the PIS codes
### can then be generated for review.
### Matches the literal text "PIS code", then a colon with optional
### whitespace on either side, and captures the four digits that
### follow as group 1.
PIS_PATTERN = re.compile(r'PIS code\s*:\s*(\d{4})')
### Matches one digit, one letter A-Z, then two digits (e.g. "1A23");
### the whole match is also captured as group 1.
HEADCODE_PATTERN = re.compile(r'(\d{1}[A-Z]\d{2})')
def extract_tables(file_path):
    """Collect PIS code / headcode matches from every table in a document.

    The first row of each table is used as the header. Every later row is
    turned into a dict mapping header cell text to that row's cell text.
    The rows of one table are passed to ``match_pis_and_headcode``; when it
    returns a result, a ``'source_file'`` entry is added and it is kept.

    Args:
        file_path: Location of the document, passed straight to ``Document``.

    Returns:
        A list of the dicts returned by ``match_pis_and_headcode`` (one per
        matching table), each extended with ``'source_file'``. The list is
        empty when no table produced a match.
    """
    doc = Document(file_path)
    tables = doc.tables
    print(f"Reading {len(tables)} tables from {file_path}")

    matches = []
    for table in tables:
        # Read the text of every cell up front, one tuple per row.
        row_texts = [
            tuple(cell.text for cell in row.cells) for row in table.rows
        ]
        # Row 0 supplies the dict keys; it is not itself a data record.
        header = row_texts[0] if row_texts else ()
        records = [dict(zip(header, texts)) for texts in row_texts[1:]]

        found = match_pis_and_headcode(records)
        if not found:
            continue
        found['source_file'] = file_path
        matches.append(found)

    return matches
def match_pis_and_headcode(table_data):
    """Find the first PIS code and the first headcode in one table's rows.

    Values are scanned row by row, in dict order, for ``PIS_PATTERN``. The
    first hit supplies the PIS code (capture group 1), and the key it was
    stored under, stripped of surrounding whitespace, becomes ``job_head``.
    Only when a PIS code was found are the keys then scanned, in the same
    order, for ``HEADCODE_PATTERN``.

    Args:
        table_data: Iterable of dicts mapping header text to cell text.

    Returns:
        ``{'job_head': ..., 'headcode': ..., 'pis': ...}`` when both a PIS
        code and a headcode were found, otherwise ``None``.
    """
    pis_candidates = (
        (column, PIS_PATTERN.search(cell_text))
        for record in table_data
        for column, cell_text in record.items()
    )
    pis_column, pis_match = next(
        ((column, hit) for column, hit in pis_candidates if hit),
        (None, None),
    )
    if pis_match is None:
        # Without a PIS code the headers are never inspected.
        return None

    pis_value = pis_match.group(1)
    heading = pis_column.strip()

    header_candidates = (
        HEADCODE_PATTERN.search(column)
        for record in table_data
        for column in record
    )
    headcode_match = next((hit for hit in header_candidates if hit), None)
    if headcode_match is None:
        return None

    return {
        'job_head': heading,
        'headcode': headcode_match.group(),
        'pis': pis_value,
    }
def solo_run():
    """Run ``extract_tables`` on ``./file.docx`` and print what it returns."""
    results = extract_tables("./file.docx")
    print(results)
### Only run the stand-alone demo when this file is executed directly,
### not when it is imported as a module.
if __name__ == "__main__":
    solo_run()