### This uses the 'python-docx-2023' module
import re
from typing import Dict, List, Mapping, Optional, Sequence, Tuple

### This can parse each table.  What needs to happen next
### is to parse all tables, then check for a PIS code.
### If a PIS code exists, then find the associated headcode.
### Then an API request can be made to OwlBoard to try
### and find a service with a valid stopping pattern,
### then the PIS codes can be generated for review.

# Four digits introduced by the literal label "PIS code:" (optional
# whitespace on either side of the colon).
PIS_PATTERN = re.compile(r'PIS code\s*:\s*(\d{4})')
# One digit, one capital letter, two digits - e.g. "1A23".
HEADCODE_PATTERN = re.compile(r'(\d{1}[A-Z]\d{2})')


def _table_records(table) -> List[Dict[str, str]]:
    """Return one dict per body row of *table*, keyed by the header-row text.

    The first row supplies the keys and is not itself included in the
    result.  A table with no rows yields an empty list.
    """
    rows = iter(table.rows)
    header = next(rows, None)
    if header is None:
        return []
    keys = tuple(cell.text for cell in header.cells)
    # zip() stops at the shorter of header/row, and repeated header texts
    # collapse to a single key (the right-most cell wins).
    return [dict(zip(keys, (cell.text for cell in row.cells))) for row in rows]


def _find_pis(
    table_data: Sequence[Mapping[str, str]],
) -> Optional[Tuple[str, str]]:
    """Return ``(stripped column header, PIS code)`` for the first cell value
    that matches ``PIS_PATTERN``, or ``None`` when no value matches."""
    for record in table_data:
        for key, value in record.items():
            found = PIS_PATTERN.search(value)
            if found:
                return key.strip(), found.group(1)
    return None


def _find_headcode(table_data: Sequence[Mapping[str, str]]) -> Optional[str]:
    """Return the first headcode found in any column header, or ``None``."""
    for record in table_data:
        for key in record:
            found = HEADCODE_PATTERN.search(key)
            if found:
                return found.group()
    return None


def extract_tables(file_path) -> List[Dict[str, str]]:
    """Scan every table in a .docx file for a PIS code and a headcode.

    Args:
        file_path: Path of the document; passed straight to
            ``docx.Document``.

    Returns:
        One dict per table in which both a PIS code and a headcode were
        found, with the keys ``job_head``, ``headcode``, ``pis`` and
        ``source_file`` (the *file_path* that was given).  Tables missing
        either code are skipped.

    Prints a one-line summary of how many tables are being read.
    """
    # Imported here because this is the only function that needs
    # python-docx; the matching logic stays importable without it.
    from docx import Document

    document = Document(file_path)
    print(f"Reading {len(document.tables)} tables from {file_path}")

    pis_info = []
    for table in document.tables:
        pis_and_headcode = match_pis_and_headcode(_table_records(table))
        if pis_and_headcode:
            pis_and_headcode['source_file'] = file_path
            pis_info.append(pis_and_headcode)
    return pis_info


def match_pis_and_headcode(
    table_data: Sequence[Mapping[str, str]],
) -> Optional[Dict[str, str]]:
    """Pair the first PIS code in a table with the first headcode.

    Args:
        table_data: Row dicts mapping column-header text to cell text.
            It is iterated twice, so it must be re-iterable (a list, not a
            generator).

    Returns:
        ``{'job_head': ..., 'headcode': ..., 'pis': ...}`` where
        ``job_head`` is the stripped header of the column whose cell held
        the PIS code, or ``None`` if either code is missing.
    """
    pis_hit = _find_pis(table_data)
    if pis_hit is None:
        return None

    # NOTE(review): the headcode is taken from the first header that
    # contains one, which is not necessarily the PIS code's own column -
    # confirm this is the intended association.
    headcode = _find_headcode(table_data)
    if headcode is None:
        return None

    job_head, pis_code = pis_hit
    return {'job_head': job_head, 'headcode': headcode, 'pis': pis_code}


def solo_run(file_path="./file.docx"):
    """Print the PIS/headcode matches found in *file_path*."""
    print(extract_tables(file_path))


if __name__ == "__main__":
    solo_run()