Changes:

- PIS Finder - DOCX Parser - LOCAL MODE (DEPRECATED ALREADY)
2024-02-16 21:18:26 +00:00 · 2024-02-16 21:18:26 +00:00 · 59f6439872
commit 59f6439872
parent d3e24c8a03
11 changed files with 159 additions and 121 deletions
--- a/CNDR.docx
+++ b/CNDR.docx
--- a/src/find-missing-from-docx.py
+++ b/src/find-missing-from-docx.py
@ -1,34 +0,0 @@
-### This uses the 'python-docx-2023' module
-from docx import Document
-
-def extract_table(file_path):
-    document = Document(file_path)
-
-    table = document.tables[4]
-    print(document.tables[1])
-    print(document.tables[2])
-    print(document.tables[3])
-    print(document.tables[4])
-    print(document.tables[5])
-
-    data = []
-    keys = None
-    for i, row in enumerate(table.rows):
-        text = (cell.text for cell in row.cells)
-        if i == 0:
-            keys = tuple(text)
-            continue
-        row_data = dict(zip(keys, text))
-        data.append(row_data)
-
-    print(data)
-
-if __name__ == "__main__":
-    extract_table("./file.docx")
-
-### This can parse each table.  What needs to happen next
-### is to parse all tables, then check for a PIS code.
-### If PIS code exists, then find the associated headcode,
-### Then an API request can be made to OwlBoard to try
-### and find a service with valid stopping pattern,
-### then the PIS codes can be generated for review.
--- a/src/find-missing-from-pdf-dir.py
+++ b/src/find-missing-from-pdf-dir.py
@ -1,87 +0,0 @@
-import os, sys, json, subprocess, re, yaml, requests
-
-report_file_path = "./report.txt"
-code_store_list_url = "https://git.fjla.uk/OwlBoard/data/raw/branch/main/pis/gw.yaml"
-
-def is_pdfgrep_installed():
-    try:
-        subprocess.check_call(["pdfgrep", "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return True
-    except subprocess.CalledProcessError:
-        return False
-
-def fetch_and_parse_yaml(url):
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        existing_codes = yaml.safe_load(response.text)
-        return existing_codes
-    except Exception as e:
-        print(f"Error downloading and parsing codes: {e}")
-        sys.exit(1)
-
-def main():
-
-    if len(sys.argv) != 2:
-        print("Usage: python pdf_code_extraction.py <directory_path>")
-        sys.exit(1)
-
-    pdf_directory = sys.argv[1]
-
-    if not os.path.isdir(pdf_directory):
-        print(f"'{pdf_directory}' is not a valid directory.")
-        sys.exit(1)
-
-    if not is_pdfgrep_installed():
-        print("pdfgrep is not installed on your system.")
-        sys.exit(1)
-
-    code_list = []
-
-    pdfgrep_cmd = f"pdfgrep -Prno 'code\\s*:\\s*\\d{{4}}' {pdf_directory}"
-    pdfgrep_output = subprocess.getoutput(pdfgrep_cmd)
-    
-
-    for line in pdfgrep_output.splitlines():
-        match = re.search(r"^(.*?):\s*code\s*:\s*(\d{4})", line)
-        if match:
-            filename, code = match.groups()
-            code_list.append({"file":filename, "code":str(code)})
-    
-    existing_codes = fetch_and_parse_yaml(code_store_list_url)['pis']
-    existing_set = set()
-    for item in existing_codes:
-        code = item['code']
-        existing_set.add(str(code))
-
-    unique_codes = set()
-    unique_code_list = []
-    missing_codes = []
-    for item in code_list:
-        code = item['code']
-        if code not in unique_codes:
-            unique_codes.add(code)
-            unique_code_list.append(item)
-            if code not in existing_set:
-                missing_codes.append(item)
-
-    #print(missing_codes)
-
-    report = f"""
-    Number of missing codes found: {len(missing_codes)}
-
-Missing Codes:
-    """
-
-    for item in missing_codes:
-        report += f"\n  - code: {item['code']}\n    stops: (File: {item['file']})"
-
-    
-    print(f"Saving report to {report_file_path}")
-    with open(report_file_path, 'w') as report_file:
-        report_file.write(report)
-
-    print(report)
-
-if __name__ == "__main__":
-    main()
--- a/src/find_service.py
+++ b/src/find_service.py
@ -0,0 +1,3 @@
+## Uses the HEADCODE to guess at the service the PIS code matches
+## Where there are multiple matches, all of them are prepared and
+## await human review.
--- a/src/gitea_connector.py
+++ b/src/gitea_connector.py
--- a/src/local_mode.py
+++ b/src/local_mode.py
@ -0,0 +1,30 @@
+import parse_docx, pis_find
+
+import os
+
+def start():
+    print("Local mode activated")
+    working_directory = os.getcwd()
+    print("Working directory: ", working_directory)
+
+    ## Get all files in directory
+    files = [f for f in os.listdir(working_directory) if os.path.isfile(os.path.join(working_directory, f))]
+    docx_files = [f for f in files if f.endswith(".docx")]
+
+    results = []
+
+    if docx_files:
+        print(f"Found {len(docx_files)} DOCX files in directory")
+        for file in docx_files:
+            print(file)
+            items = parse_docx.extract_tables(file)
+            results.extend(items)
+    else:
+        print("No DOCX files found")
+
+    print(f"Found {len(results)} PIS Codes in documents")
+    pis_find.run(results)
+
+
+if __name__ == "__main__":
+    start()
--- a/src/mailbox_mode.py
+++ b/src/mailbox_mode.py
--- a/src/main.py
+++ b/src/main.py
@ -0,0 +1,16 @@
+import sys
+
+def main():
+    mode = sys.argv[1] if len(sys.argv) > 1 else "local"
+
+    if mode == "local":
+        import local_mode
+        local_mode.start()
+    elif mode == "mailbox":
+        print("MailBox mode not available yet")
+        pass
+    else:
+        print("Invalid mode.  Please specify 'local' or 'mailbox'")
+
+if __name__ == "__main__":
+    main()
--- a/src/owlboard_connector.py
+++ b/src/owlboard_connector.py
@ -0,0 +1,5 @@
+### API REQUESTS HERE
+
+### AUTHENTICATION MUST BE COMPLETED, REGISTERING FOR THE API IF NECESSARY
+### THIS NEGATES THE ABILITY TO USE LOCAL MODE - MAILBOX MODE ONLY AS
+### MAILBOX ACCESS IS NEEDED FOR REGISTRATION
--- a/src/parse_docx.py
+++ b/src/parse_docx.py
@ -0,0 +1,72 @@
+### This uses the 'python-docx-2023' module
+from docx import Document
+import re
+
+### This can parse each table.  What needs to happen next
+### is to parse all tables, then check for a PIS code.
+### If PIS code exists, then find the associated headcode,
+### Then an API request can be made to OwlBoard to try
+### and find a service with valid stopping pattern,
+### then the PIS codes can be generated for review.
+
+PIS_PATTERN = re.compile(r'PIS code\s*:\s*(\d{4})')
+HEADCODE_PATTERN = re.compile(r'(\d{1}[A-Z]\d{2})')
+
+def extract_tables(file_path):
+    document = Document(file_path)
+    print(f"Reading {len(document.tables)} tables from {file_path}")
+
+    pis_info = []
+
+    for table in document.tables:
+        data = []
+        for i, row in enumerate(table.rows):
+            text = (cell.text for cell in row.cells)
+            if i == 0:
+                keys = tuple(text)
+                continue
+            row_data = dict(zip(keys, text))
+            data.append(row_data)
+        pis_and_headcode = match_pis_and_headcode(data)
+        if pis_and_headcode:
+            pis_and_headcode['source_file'] = file_path
+            pis_info.append(pis_and_headcode)
+
+    return(pis_info)
+        
+
+def match_pis_and_headcode(table_data):
+    pis_code = None
+    headcode = None
+    job_head = None
+
+    for item in table_data:
+        for key, value in item.items():
+            match = PIS_PATTERN.search(value)
+            if match:
+                pis_code = match.group(1)
+                job_head = key.strip()
+                break
+            if pis_code:
+                break
+
+    if pis_code:
+        for item in table_data:
+            for key in item:
+                match = HEADCODE_PATTERN.search(key)
+                if match:
+                    headcode = match.group()
+                    break
+                if headcode:
+                    break
+
+    if pis_code and headcode:
+        return {'job_head': job_head, 'headcode': headcode, 'pis': pis_code}
+    else:
+        return None
+
+def solo_run():
+    print(extract_tables("./file.docx"))
+
+if __name__ == "__main__":
+    solo_run()
--- a/src/pis_find.py
+++ b/src/pis_find.py
@ -0,0 +1,33 @@
+import requests
+
+def run(data_list):
+    deduplicated_data = dedup(data_list)
+    print(f"Removed {len(data_list) - len(deduplicated_data)} duplicate codes")
+    print(f"Searching for {len(deduplicated_data)} PIS codes")
+    missing_data = find_missing(deduplicated_data)
+    print(f"{missing_data} missing PIS codes in OwlBoard data")
+
+
+def dedup(data_list):
+    unique_dicts = {d['pis']: d for d in data_list}.values()
+    unique_list_of_dicts = list(unique_dicts)
+    return unique_list_of_dicts
+
+
+## AUTH REQUIRED!!!
+def find_missing(data_list):
+    BASEURL = 'http://localhost:8460/api/v2/pis/byCode/'
+    #BASEURL = 'https://owlboard.info/api/v2/pis/byCode/'
+    missing_data = []
+
+    for item in data_list:
+        pis_code = item.get('pis')
+        if pis_code:
+            url = BASEURL + pis_code
+            response = requests.get(url)
+            if response.status_code == 200:
+                json_response = response.json()
+                if json_response and isinstance(json_response, list):
+                    missing_data.append(item)
+            else:
+                print(f"Request failed for PIS {pis_code}.  Status: {response.status_code}")