diagram-parser/src/parse_docx.py

### This uses the 'python-docx-2023' module
from docx import Document
import re

### This can parse each table.  What needs to happen next
### is to parse all tables, then check for a PIS code.
### If a PIS code exists, then find the associated headcode.
### Then an API request can be made to OwlBoard to try
### and find a service with a valid stopping pattern,
### then the PIS codes can be generated for review.

### Matches the literal text "PIS code", a colon with optional whitespace
### on either side, and captures the four digits that follow as group 1.
PIS_PATTERN = re.compile(r'PIS code\s*:\s*(\d{4})')
### Captures one digit, one uppercase A-Z letter, then two digits
### (e.g. "2A34").  The pattern is unanchored, so it matches anywhere
### inside the searched string.
HEADCODE_PATTERN = re.compile(r'(\d{1}[A-Z]\d{2})')

def extract_tables(file_path):
    """Collect PIS code / headcode pairs from every table in a .docx file.

    Each table is converted to a list of dicts: the first row supplies the
    keys and every following row supplies one dict of cell texts.  That list
    is handed to match_pis_and_headcode(); any non-empty result is tagged
    with the originating file path under 'source_file'.

    Args:
        file_path: Path of the document, passed straight to Document().

    Returns:
        A list of result dicts, one per table that yielded a match.
    """
    document = Document(file_path)
    print(f"Reading {len(document.tables)} tables from {file_path}")

    results = []

    for table in document.tables:
        header = None
        records = []
        for row in table.rows:
            cell_texts = tuple(cell.text for cell in row.cells)
            if header is None:
                # The first row of a table only provides the column keys.
                header = cell_texts
            else:
                records.append(dict(zip(header, cell_texts)))

        found = match_pis_and_headcode(records)
        if not found:
            continue
        found['source_file'] = file_path
        results.append(found)

    return results


def match_pis_and_headcode(table_data):
    """Find the first PIS code and the first headcode in one parsed table.

    Args:
        table_data: List of row dicts mapping column header text to cell
            text.

    Returns:
        A dict with the keys 'job_head' (the stripped header of the column
        whose cell held the PIS code), 'headcode' and 'pis', or None when
        either the PIS code or the headcode cannot be found.
    """
    pis_code = None
    job_head = None

    # Cell values are scanned row by row.  The first hit wins: the guard
    # that leaves the outer loop must sit at row level, otherwise later
    # rows keep being inspected and can overwrite the result.
    for item in table_data:
        for key, value in item.items():
            match = PIS_PATTERN.search(value)
            if match:
                pis_code = match.group(1)
                job_head = key.strip()
                break
        if pis_code is not None:
            break

    if pis_code is None:
        return None

    # The headcode is looked for in the column headers (dict keys) only.
    headcode = None
    for item in table_data:
        for key in item:
            match = HEADCODE_PATTERN.search(key)
            if match:
                headcode = match.group()
                break
        if headcode is not None:
            break

    if headcode is None:
        return None

    return {'job_head': job_head, 'headcode': headcode, 'pis': pis_code}

def solo_run():
    """Run extract_tables() on the hard-coded "./file.docx" and print the result."""
    results = extract_tables("./file.docx")
    print(results)

### Only run the sample extraction when this file is executed as a script.
if __name__ == "__main__":
    solo_run()