Changes:
- PIS Finder - DOCX Parser - LOCAL MODE (DEPRECATED ALREADY)
This commit is contained in:
parent d3e24c8a03
commit 59f6439872
BIN src/0222 THO SC CNDR.docx (Normal file)
Binary file not shown.
@@ -1,34 +0,0 @@
### This uses the 'python-docx-2023' module
from docx import Document


def extract_table(file_path):
    document = Document(file_path)

    table = document.tables[4]
    print(document.tables[1])
    print(document.tables[2])
    print(document.tables[3])
    print(document.tables[4])
    print(document.tables[5])

    data = []
    keys = None
    for i, row in enumerate(table.rows):
        text = (cell.text for cell in row.cells)
        if i == 0:
            keys = tuple(text)
            continue
        row_data = dict(zip(keys, text))
        data.append(row_data)

    print(data)

if __name__ == "__main__":
    extract_table("./file.docx")

### This can parse each table. What needs to happen next
### is to parse all tables, then check for a PIS code.
### If PIS code exists, then find the associated headcode,
### Then an API request can be made to OwlBoard to try
### and find a service with valid stopping pattern,
### then the PIS codes can be generated for review.

@@ -1,87 +0,0 @@
import os, sys, json, subprocess, re, yaml, requests

report_file_path = "./report.txt"
code_store_list_url = "https://git.fjla.uk/OwlBoard/data/raw/branch/main/pis/gw.yaml"

def is_pdfgrep_installed():
    try:
        subprocess.check_call(["pdfgrep", "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except subprocess.CalledProcessError:
        return False

def fetch_and_parse_yaml(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        existing_codes = yaml.safe_load(response.text)
        return existing_codes
    except Exception as e:
        print(f"Error downloading and parsing codes: {e}")
        sys.exit(1)

def main():

    if len(sys.argv) != 2:
        print("Usage: python pdf_code_extraction.py <directory_path>")
        sys.exit(1)

    pdf_directory = sys.argv[1]

    if not os.path.isdir(pdf_directory):
        print(f"'{pdf_directory}' is not a valid directory.")
        sys.exit(1)

    if not is_pdfgrep_installed():
        print("pdfgrep is not installed on your system.")
        sys.exit(1)

    code_list = []

    pdfgrep_cmd = f"pdfgrep -Prno 'code\\s*:\\s*\\d{{4}}' {pdf_directory}"
    pdfgrep_output = subprocess.getoutput(pdfgrep_cmd)

    for line in pdfgrep_output.splitlines():
        match = re.search(r"^(.*?):\s*code\s*:\s*(\d{4})", line)
        if match:
            filename, code = match.groups()
            code_list.append({"file": filename, "code": str(code)})

    existing_codes = fetch_and_parse_yaml(code_store_list_url)['pis']
    existing_set = set()
    for item in existing_codes:
        code = item['code']
        existing_set.add(str(code))

    unique_codes = set()
    unique_code_list = []
    missing_codes = []
    for item in code_list:
        code = item['code']
        if code not in unique_codes:
            unique_codes.add(code)
            unique_code_list.append(item)
            if code not in existing_set:
                missing_codes.append(item)

    #print(missing_codes)

    report = f"""
Number of missing codes found: {len(missing_codes)}

Missing Codes:
"""

    for item in missing_codes:
        report += f"\n - code: {item['code']}\n   stops: (File: {item['file']})"

    print(f"Saving report to {report_file_path}")
    with open(report_file_path, 'w') as report_file:
        report_file.write(report)

    print(report)

if __name__ == "__main__":
    main()

3 src/find_service.py (Normal file)
@@ -0,0 +1,3 @@
## Uses the HEADCODE to guess at the service the PIS code matches.
## Where there are multiple matches, all are prepared and
## await human review.
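A minimal sketch of how that matching could look once an OwlBoard lookup exists. The lookup_by_headcode callable, the needs_review flag and the returned fields are assumptions for illustration, not part of this commit; the input dict shape follows parse_docx.extract_tables.

## Hypothetical sketch only: lookup_by_headcode is an assumed callable
## (e.g. a future owlboard_connector function) returning candidate services.
def guess_services(pis_entry, lookup_by_headcode):
    # pis_entry comes from parse_docx.extract_tables, e.g.
    # {'job_head': ..., 'headcode': '2C04', 'pis': '0221', 'source_file': ...}
    candidates = lookup_by_headcode(pis_entry['headcode'])
    # A single hit could be accepted automatically; multiple hits are all
    # kept and flagged so a human can pick the correct stopping pattern.
    return [
        {'service': svc, 'pis': pis_entry['pis'], 'needs_review': len(candidates) > 1}
        for svc in candidates
    ]
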
0 src/gitea_connector.py (Normal file)
30 src/local_mode.py (Normal file)
@@ -0,0 +1,30 @@
import parse_docx, pis_find

import os

def start():
    print("Local mode activated")
    working_directory = os.getcwd()
    print("Working directory: ", working_directory)

    ## Get all files in directory
    files = [f for f in os.listdir(working_directory) if os.path.isfile(os.path.join(working_directory, f))]
    docx_files = [f for f in files if f.endswith(".docx")]

    results = []

    if docx_files:
        print(f"Found {len(docx_files)} DOCX files in directory")
        for file in docx_files:
            print(file)
            items = parse_docx.extract_tables(file)
            results.extend(items)
    else:
        print("No DOCX files found")

    print(f"Found {len(results)} PIS Codes in documents")
    pis_find.run(results)


if __name__ == "__main__":
    start()

0 src/mailbox_mode.py (Normal file)
16 src/main.py (Normal file)
@@ -0,0 +1,16 @@
import sys

def main():
    mode = sys.argv[1] if len(sys.argv) > 1 else "local"

    if mode == "local":
        import local_mode
        local_mode.start()
    elif mode == "mailbox":
        print("MailBox mode not available yet")
    else:
        print("Invalid mode. Please specify 'local' or 'mailbox'")

if __name__ == "__main__":
    main()

5 src/owlboard_connector.py (Normal file)
@@ -0,0 +1,5 @@
### API REQUESTS HERE

### AUTHENTICATION MUST BE COMPLETED, REGISTERING FOR THE API IF NECESSARY
### THIS NEGATES THE ABILITY TO USE LOCAL MODE - MAILBOX MODE ONLY, AS
### MAILBOX ACCESS IS NEEDED FOR REGISTRATION
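A rough sketch of what this connector might grow into. The /api/v2/pis/byCode/ path is taken from pis_find.py below; the token header name, the environment variable and the registration flow are placeholders, not a confirmed OwlBoard API contract.

## Hypothetical sketch only: the header name and environment variable
## are assumptions, not the confirmed OwlBoard authentication scheme.
import os
import requests

BASE_URL = "https://owlboard.info/api/v2"

def _headers():
    # Assumed: a token obtained at registration is supplied via an
    # environment variable and sent as a request header.
    token = os.environ.get("OWLBOARD_API_TOKEN", "")
    return {"uuid": token} if token else {}

def get_pis_by_code(code):
    # Fetch PIS details for a four-digit code; returns parsed JSON or None.
    response = requests.get(f"{BASE_URL}/pis/byCode/{code}", headers=_headers())
    if response.status_code == 200:
        return response.json()
    return None
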
72 src/parse_docx.py (Normal file)
@@ -0,0 +1,72 @@
### This uses the 'python-docx-2023' module
from docx import Document
import re

### This can parse each table. What needs to happen next
### is to parse all tables, then check for a PIS code.
### If PIS code exists, then find the associated headcode,
### Then an API request can be made to OwlBoard to try
### and find a service with valid stopping pattern,
### then the PIS codes can be generated for review.

PIS_PATTERN = re.compile(r'PIS code\s*:\s*(\d{4})')
HEADCODE_PATTERN = re.compile(r'(\d{1}[A-Z]\d{2})')

def extract_tables(file_path):
    document = Document(file_path)
    print(f"Reading {len(document.tables)} tables from {file_path}")

    pis_info = []

    for table in document.tables:
        data = []
        for i, row in enumerate(table.rows):
            text = (cell.text for cell in row.cells)
            if i == 0:
                keys = tuple(text)
                continue
            row_data = dict(zip(keys, text))
            data.append(row_data)
        pis_and_headcode = match_pis_and_headcode(data)
        if pis_and_headcode:
            pis_and_headcode['source_file'] = file_path
            pis_info.append(pis_and_headcode)

    return pis_info


def match_pis_and_headcode(table_data):
    pis_code = None
    headcode = None
    job_head = None

    for item in table_data:
        for key, value in item.items():
            match = PIS_PATTERN.search(value)
            if match:
                pis_code = match.group(1)
                job_head = key.strip()
                break
        if pis_code:
            break

    if pis_code:
        for item in table_data:
            for key in item:
                match = HEADCODE_PATTERN.search(key)
                if match:
                    headcode = match.group()
                    break
            if headcode:
                break

    if pis_code and headcode:
        return {'job_head': job_head, 'headcode': headcode, 'pis': pis_code}
    else:
        return None

def solo_run():
    print(extract_tables("./file.docx"))

if __name__ == "__main__":
    solo_run()

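For reference, each entry that extract_tables returns has this shape (field names from the code above; the values here are invented for illustration):

# Illustrative only: invented values showing the shape of one result entry.
example_result = [
    {
        'job_head': 'Job 3',                       # column heading that held the PIS code
        'headcode': '2C04',                        # matched by HEADCODE_PATTERN
        'pis': '0221',                             # matched by PIS_PATTERN
        'source_file': './0222 THO SC CNDR.docx',  # added by extract_tables
    }
]
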
33 src/pis_find.py (Normal file)
@@ -0,0 +1,33 @@
import requests

def run(data_list):
    deduplicated_data = dedup(data_list)
    print(f"Removed {len(data_list) - len(deduplicated_data)} duplicate codes")
    print(f"Searching for {len(deduplicated_data)} PIS codes")
    missing_data = find_missing(deduplicated_data)
    print(f"{len(missing_data)} missing PIS codes in OwlBoard data")


def dedup(data_list):
    unique_dicts = {d['pis']: d for d in data_list}.values()
    unique_list_of_dicts = list(unique_dicts)
    return unique_list_of_dicts


## AUTH REQUIRED!!!
def find_missing(data_list):
    BASEURL = 'http://localhost:8460/api/v2/pis/byCode/'
    #BASEURL = 'https://owlboard.info/api/v2/pis/byCode/'
    missing_data = []

    for item in data_list:
        pis_code = item.get('pis')
        if pis_code:
            url = BASEURL + pis_code
            response = requests.get(url)
            if response.status_code == 200:
                json_response = response.json()
                if json_response and isinstance(json_response, list):
                    missing_data.append(item)
            else:
                print(f"Request failed for PIS {pis_code}. Status: {response.status_code}")
    return missing_data

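A quick illustration of dedup() behaviour with invented entries: the dict comprehension keys on 'pis', so later duplicates overwrite earlier ones and one entry survives per code.

# Illustrative only: invented entries demonstrating dedup().
sample = [
    {'job_head': 'Job 1', 'headcode': '2C04', 'pis': '0221', 'source_file': 'a.docx'},
    {'job_head': 'Job 2', 'headcode': '2C06', 'pis': '0221', 'source_file': 'b.docx'},
    {'job_head': 'Job 3', 'headcode': '1A99', 'pis': '0458', 'source_file': 'b.docx'},
]
print(dedup(sample))
# Two entries remain; for PIS 0221 the 'Job 2' dict wins because the
# comprehension overwrites earlier values that share the same 'pis' key.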