# ---- src/0222 THO SC CNDR.docx: new binary file (no text content) ----

# ---- src/find-missing-from-docx.py (deleted by this change) ----
### This uses the 'python-docx-2023' module
from docx import Document


def extract_table(file_path, table_index=4):
    """Read one table of a DOCX file as a list of row dictionaries.

    The first row of the table supplies the dictionary keys; each following
    row becomes one dict mapping header text to that row's cell text.

    Args:
        file_path: Path of the DOCX document to open.
        table_index: Position of the table inside the document. Defaults to
            4, the index this script previously had hard-coded.

    Returns:
        The list of row dictionaries (it is also printed, as before).

    Raises:
        IndexError: If the document has no table at ``table_index``.
    """
    document = Document(file_path)
    tables = document.tables

    # The old debug prints of tables[1]..tables[5] crashed on any document
    # with fewer than six tables; only the requested table is touched now.
    try:
        table = tables[table_index]
    except IndexError:
        raise IndexError(
            f"{file_path} contains {len(tables)} tables; "
            f"there is no table at index {table_index}"
        ) from None

    rows = iter(table.rows)
    header_row = next(rows, None)
    if header_row is None:
        data = []
    else:
        keys = tuple(cell.text for cell in header_row.cells)
        # zip() truncates to the shorter side, so a short row simply yields
        # a dict with fewer keys.
        data = [
            dict(zip(keys, (cell.text for cell in row.cells)))
            for row in rows
        ]

    print(data)
    return data


if __name__ == "__main__":
    extract_table("./file.docx")

### This can parse each table. What needs to happen next
### is to parse all tables, then check for a PIS code.
### If PIS code exists, then find the associated headcode,
### Then an API request can be made to OwlBoard to try
### and find a service with valid stopping pattern,
### then the PIS codes can be generated for review.
# ---- src/find-missing-from-pdf-dir.py (deleted by this change) ----
import json
import os
import re
import subprocess
import sys

import requests
import yaml

report_file_path = "./report.txt"
code_store_list_url = "https://git.fjla.uk/OwlBoard/data/raw/branch/main/pis/gw.yaml"


def is_pdfgrep_installed():
    """Return True when the ``pdfgrep`` executable can be started.

    A missing executable raises ``OSError`` (``FileNotFoundError``) rather
    than ``CalledProcessError``, so both are treated as "not installed".
    """
    try:
        # DEVNULL instead of PIPE: nothing reads the pipes, and the
        # subprocess docs advise against PIPE with check_call.
        subprocess.check_call(
            ["pdfgrep", "--version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        return False
    return True


def fetch_and_parse_yaml(url):
    """Download ``url`` and return its content parsed as YAML.

    Any failure (network, HTTP status, YAML syntax) is reported on stdout
    and terminates the script with exit status 1.
    """
    try:
        # A timeout keeps the script from hanging on an unresponsive host.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return yaml.safe_load(response.text)
    except Exception as e:
        print(f"Error downloading and parsing codes: {e}")
        sys.exit(1)


def main():
    """Report PIS codes found in a directory of PDFs but absent upstream.

    Takes the PDF directory as the single command-line argument, greps every
    PDF for ``code: NNNN``, compares the codes against the list downloaded
    from ``code_store_list_url`` and writes the result to
    ``report_file_path`` (also printed).
    """
    if len(sys.argv) != 2:
        # NOTE(review): the usage text ends where an argument placeholder
        # would be expected; left exactly as found.
        print("Usage: python pdf_code_extraction.py ")
        sys.exit(1)

    pdf_directory = sys.argv[1]

    if not os.path.isdir(pdf_directory):
        print(f"'{pdf_directory}' is not a valid directory.")
        sys.exit(1)

    if not is_pdfgrep_installed():
        print("pdfgrep is not installed on your system.")
        sys.exit(1)

    # Argument list, no shell: directory names containing spaces or shell
    # metacharacters are passed through verbatim instead of being
    # interpreted. pdfgrep's exit status is deliberately not checked because
    # "no match" is reported as a non-zero status.
    pdfgrep_result = subprocess.run(
        ["pdfgrep", "-Prno", r"code\s*:\s*\d{4}", pdf_directory],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )

    line_pattern = re.compile(r"^(.*?):\s*code\s*:\s*(\d{4})")
    code_list = []
    for line in pdfgrep_result.stdout.splitlines():
        match = line_pattern.search(line)
        if match:
            filename, code = match.groups()
            code_list.append({"file": filename, "code": str(code)})

    existing_codes = fetch_and_parse_yaml(code_store_list_url)['pis']
    existing_set = {str(item['code']) for item in existing_codes}

    # Keep the first occurrence of every code that is not already known.
    seen_codes = set()
    missing_codes = []
    for item in code_list:
        code = item['code']
        if code in seen_codes:
            continue
        seen_codes.add(code)
        if code not in existing_set:
            missing_codes.append(item)

    report = f"""
    Number of missing codes found: {len(missing_codes)}

Missing Codes:
    """
    report += "".join(
        f"\n - code: {item['code']}\n stops: (File: {item['file']})"
        for item in missing_codes
    )

    print(f"Saving report to {report_file_path}")
    with open(report_file_path, 'w', encoding="utf-8") as report_file:
        report_file.write(report)

    print(report)


if __name__ == "__main__":
    main()


# ---- src/find_service.py (new file) ----
## Uses the HEADCODE to guess at the service the PIS code matches
## Where there are multiple matches both are prepared and
## await human review.

# ---- src/gitea_connector.py (new file, empty) ----

# ---- src/local_mode.py (new file) ----
import os

import parse_docx
import pis_find


def start():
    """Scan the working directory for DOCX files and look up their PIS codes.

    Every ``.docx`` file directly inside the current working directory is
    handed to ``parse_docx.extract_tables``; the combined results are then
    passed to ``pis_find.run``.
    """
    print("Local mode activated")
    working_directory = os.getcwd()
    print("Working directory: ", working_directory)

    ## Get all DOCX files in directory
    # - "~$name.docx" are lock files Word leaves next to an open document;
    #   they are not valid DOCX archives and would make the parser fail.
    # - The extension check is case-insensitive so "FILE.DOCX" is included.
    # - Sorted so the processing order does not depend on the filesystem.
    docx_files = sorted(
        name
        for name in os.listdir(working_directory)
        if os.path.isfile(os.path.join(working_directory, name))
        and name.lower().endswith(".docx")
        and not name.startswith("~$")
    )

    results = []

    if docx_files:
        print(f"Found {len(docx_files)} DOCX files in directory")
        for file in docx_files:
            print(file)
            results.extend(parse_docx.extract_tables(file))
    else:
        print("No DOCX files found")

    print(f"Found {len(results)} PIS Codes in documents")
    pis_find.run(results)


if __name__ == "__main__":
    start()

# ---- src/mailbox_mode.py (new file, empty) ----
# ---- src/main.py (new file) ----
import sys


def main():
    """Run the tool in the mode named by the first command-line argument.

    The mode defaults to "local" when no argument is given.

    Returns:
        A process exit status: 0 after local mode has run, 1 for the
        not-yet-implemented mailbox mode, 2 for an unrecognised mode.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else "local"

    if mode == "local":
        # Imported here so the other modes do not need local mode's
        # dependencies.
        import local_mode
        local_mode.start()
        return 0

    if mode == "mailbox":
        print("MailBox mode not available yet")
        return 1

    print("Invalid mode. Please specify 'local' or 'mailbox'")
    return 2


if __name__ == "__main__":
    # Propagate the status so callers can detect an unusable mode.
    sys.exit(main())


# ---- src/owlboard_connector.py (new file) ----
### API REQUESTS HERE

### AUTHENTICATION MUST BE COMPLETED, REGISTERING FOR THE API IF NECESSARY
### THIS NEGATES THE ABILITY TO USE LOCAL MODE - MAILBOX MODE ONLY AS
### MAILBOX ACCESS IS NEEDED FOR REGISTRATION


# ---- src/parse_docx.py (new file) ----
### This uses the 'python-docx-2023' module
from docx import Document
import re

### This can parse each table. What needs to happen next
### is to parse all tables, then check for a PIS code.
### If PIS code exists, then find the associated headcode,
### Then an API request can be made to OwlBoard to try
### and find a service with valid stopping pattern,
### then the PIS codes can be generated for review.
# ---- src/parse_docx.py (continued) ----

PIS_PATTERN = re.compile(r'PIS code\s*:\s*(\d{4})')
HEADCODE_PATTERN = re.compile(r'(\d{1}[A-Z]\d{2})')


def extract_tables(file_path):
    """Collect the PIS code / headcode pair of every table in a DOCX file.

    Each table is converted to a list of dicts (first row = keys, remaining
    rows = values) and passed to ``match_pis_and_headcode``.

    Args:
        file_path: Path of the DOCX document to read.

    Returns:
        A list of dicts with the keys ``job_head``, ``headcode``, ``pis``
        and ``source_file``; tables without both a PIS code and a headcode
        contribute nothing.
    """
    document = Document(file_path)
    print(f"Reading {len(document.tables)} tables from {file_path}")

    pis_info = []

    for table in document.tables:
        rows = iter(table.rows)
        header_row = next(rows, None)
        if header_row is None:
            continue
        keys = tuple(cell.text for cell in header_row.cells)
        data = [
            dict(zip(keys, (cell.text for cell in row.cells)))
            for row in rows
        ]

        pis_and_headcode = match_pis_and_headcode(data)
        if pis_and_headcode:
            pis_and_headcode['source_file'] = file_path
            pis_info.append(pis_and_headcode)

    return pis_info


def _first_pis_code(table_data):
    """Return ``(stripped column header, code)`` of the first PIS code found in a cell value."""
    for row in table_data:
        for key, value in row.items():
            match = PIS_PATTERN.search(value)
            if match:
                return key.strip(), match.group(1)
    return None


def _first_headcode(table_data):
    """Return the first headcode found in any column header, else None."""
    for row in table_data:
        for key in row:
            match = HEADCODE_PATTERN.search(key)
            if match:
                return match.group()
    return None


def match_pis_and_headcode(table_data):
    """Find the PIS code and headcode described by one table.

    Args:
        table_data: List of row dicts mapping column header to cell text.

    Returns:
        ``{'job_head': ..., 'headcode': ..., 'pis': ...}`` when a cell value
        contains a PIS code and a column header contains a headcode;
        otherwise ``None``. ``job_head`` is the stripped header of the
        column in which the PIS code was found.
    """
    pis_hit = _first_pis_code(table_data)
    if pis_hit is None:
        return None
    job_head, pis_code = pis_hit

    headcode = _first_headcode(table_data)
    if headcode is None:
        return None

    return {'job_head': job_head, 'headcode': headcode, 'pis': pis_code}


def solo_run():
    """Parse ``./file.docx`` and print the result (manual test entry point)."""
    print(extract_tables("./file.docx"))


if __name__ == "__main__":
    solo_run()


# ---- src/pis_find.py (new file) ----

BASEURL = 'http://localhost:8460/api/v2/pis/byCode/'
#BASEURL = 'https://owlboard.info/api/v2/pis/byCode/'


def run(data_list):
    """De-duplicate the extracted PIS codes and look each one up in OwlBoard.

    Args:
        data_list: Dicts as produced by ``parse_docx.extract_tables``.

    Returns:
        The entries whose PIS code was not found in OwlBoard.
    """
    deduplicated_data = dedup(data_list)
    print(f"Removed {len(data_list) - len(deduplicated_data)} duplicate codes")
    print(f"Searching for {len(deduplicated_data)} PIS codes")
    missing_data = find_missing(deduplicated_data)
    # Report the count; this previously interpolated the list itself.
    print(f"{len(missing_data)} missing PIS codes in OwlBoard data")
    return missing_data


def dedup(data_list):
    """Return one entry per PIS code.

    When a code occurs more than once the last entry wins, while the result
    keeps the order in which each code was first seen.
    """
    by_code = {}
    for entry in data_list:
        by_code[entry['pis']] = entry
    return list(by_code.values())


## AUTH REQUIRED!!!
def find_missing(data_list, base_url=BASEURL, session=None, timeout=10):
    """Return the entries whose PIS code OwlBoard does not know.

    Args:
        data_list: Dicts carrying a ``'pis'`` key; entries without a PIS
            code are skipped.
        base_url: URL prefix to which the PIS code is appended.
        session: Optional object with a ``requests``-compatible
            ``get(url, timeout=...)`` method, e.g. an authenticated
            ``requests.Session``. Defaults to the ``requests`` module.
        timeout: Seconds to wait for each response.

    Returns:
        The list of entries for which the lookup succeeded but returned no
        results. Lookups with a non-200 status are reported on stdout and
        are not classified.
    """
    if session is None:
        # Only needed when the caller does not supply its own session.
        import requests
        session = requests

    missing_data = []

    for item in data_list:
        pis_code = item.get('pis')
        if not pis_code:
            continue

        response = session.get(base_url + pis_code, timeout=timeout)
        if response.status_code != 200:
            print(f"Request failed for PIS {pis_code}. Status: {response.status_code}")
            continue

        matches = response.json()
        # NOTE(review): assumes a successful lookup returns a non-empty list
        # when the code exists and an empty result when it does not; confirm
        # against the OwlBoard API. The earlier code recorded the *found*
        # codes as missing and never returned the list.
        found = isinstance(matches, list) and bool(matches)
        if not found:
            missing_data.append(item)

    return missing_data