"""Report codes that appear in a directory of PDFs but not in the code store.

The script runs ``pdfgrep`` recursively over a directory looking for
``code: NNNN`` fragments, downloads the YAML code list from
``code_store_list_url``, and writes every code found in the PDFs that is not
listed under the YAML document's ``pis`` key to ``report_file_path``.

Usage: python find-missing-from-pdf-dir.py <pdf_directory>
"""

import json
import os
import re
import subprocess
import sys

report_file_path = "./report.txt"
code_store_list_url = "https://git.fjla.uk/OwlBoard/data/raw/branch/main/pis/gw.yaml"

# Upper bound (seconds) on the code-store download so a stalled server
# cannot hang the script forever.
_REQUEST_TIMEOUT_SECONDS = 30

# pdfgrep is invoked with -n and -o, so each output line is
# "<file>:<page>:<matched text>".  Group 1 lazily captures everything before
# the matched text (file name plus page number); group 2 is the 4-digit code.
_CODE_LINE_RE = re.compile(r"^(.*?):\s*code\s*:\s*(\d{4})")


def is_pdfgrep_installed():
    """Return True if the ``pdfgrep`` executable can be run, False otherwise."""
    try:
        subprocess.check_call(
            ["pdfgrep", "--version"],
            # DEVNULL rather than PIPE: nothing ever reads the pipes.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError covers FileNotFoundError (binary not on PATH) and
        # PermissionError; CalledProcessError covers a non-zero exit status.
        return False
    return True


def fetch_and_parse_yaml(url):
    """Download *url* and return its body parsed as YAML.

    Prints an error and exits with status 1 if the download fails, the server
    answers with an HTTP error status, or the body is not valid YAML.
    """
    # Imported here because this is the only function that needs the
    # third-party packages.
    import requests
    import yaml

    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        return yaml.safe_load(response.text)
    except (requests.RequestException, yaml.YAMLError) as e:
        print(f"Error downloading and parsing codes: {e}")
        sys.exit(1)


def _parse_pdfgrep_output(output):
    """Turn pdfgrep output text into a list of {"file", "code"} dicts."""
    found = []
    for line in output.splitlines():
        match = _CODE_LINE_RE.search(line)
        if match:
            filename, code = match.groups()
            found.append({"file": filename, "code": code})
    return found


def _scan_pdfs_for_codes(pdf_directory):
    """Run pdfgrep recursively over *pdf_directory* and return the codes found."""
    # An argument list (no shell) keeps directory names containing spaces or
    # shell metacharacters intact and cannot be used to inject commands.
    result = subprocess.run(
        ["pdfgrep", "-Prno", r"code\s*:\s*\d{4}", pdf_directory],
        stdout=subprocess.PIPE,
        # universal_newlines is the pre-3.7 spelling of text=True.
        universal_newlines=True,
    )
    # The exit status is deliberately not checked: like grep, pdfgrep exits
    # non-zero when nothing matches, which is a valid (empty) result here.
    return _parse_pdfgrep_output(result.stdout)


def _find_missing_codes(found_codes, existing_codes):
    """Return the first occurrence of each found code absent from the store."""
    # NOTE(review): codes are compared as strings; if the YAML stores a code
    # with leading zeros unquoted it may be parsed as a number and not match
    # the 4-digit text from the PDF - confirm against the data file.
    existing = {str(item["code"]) for item in existing_codes}
    seen = set()
    missing = []
    for item in found_codes:
        code = item["code"]
        if code in seen:
            continue
        seen.add(code)
        if code not in existing:
            missing.append(item)
    return missing


def _build_report(missing_codes):
    """Format the report text for the given missing-code entries."""
    header = (
        "\n"
        f" Number of missing codes found: {len(missing_codes)}\n"
        "\n"
        "Missing Codes:\n"
        " "
    )
    entries = "".join(
        f"\n - code: {item['code']}\n stops: (File: {item['file']})"
        for item in missing_codes
    )
    return header + entries


def main():
    """Command-line entry point.

    Expects exactly one argument, the directory to scan.  Exits with status 1
    on a usage error, an invalid directory, a missing ``pdfgrep`` binary, or a
    code store that cannot be downloaded or has no ``pis`` list.  On success
    the report is written to ``report_file_path`` and echoed to stdout.
    """
    if len(sys.argv) != 2:
        print(f"Usage: python {os.path.basename(sys.argv[0])} <pdf_directory>")
        sys.exit(1)

    pdf_directory = sys.argv[1]

    if not os.path.isdir(pdf_directory):
        print(f"'{pdf_directory}' is not a valid directory.")
        sys.exit(1)

    if not is_pdfgrep_installed():
        print("pdfgrep is not installed on your system.")
        sys.exit(1)

    code_list = _scan_pdfs_for_codes(pdf_directory)

    code_store = fetch_and_parse_yaml(code_store_list_url)
    try:
        existing_codes = code_store["pis"]
    except (KeyError, TypeError):
        # Empty document, wrong top-level type, or no 'pis' key.
        print("Error downloading and parsing codes: no 'pis' list in code store.")
        sys.exit(1)

    missing_codes = _find_missing_codes(code_list, existing_codes)
    report = _build_report(missing_codes)

    print(f"Saving report to {report_file_path}")
    # File names from the PDFs may contain non-ASCII characters, so the
    # encoding is fixed rather than left to the platform default.
    with open(report_file_path, "w", encoding="utf-8") as report_file:
        report_file.write(report)

    print(report)


if __name__ == "__main__":
    main()