Add a PDF Parser which scans for missing codes
Signed-off-by: Fred Boniface <fred@fjla.uk>
This commit is contained in:
parent
4ae711e9c2
commit
14a8a63c14
87
src/find-missing-from-pdf-dir.py
Normal file
87
src/find-missing-from-pdf-dir.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
import os, sys, json, subprocess, re, yaml, requests
|
||||||
|
|
||||||
|
# Where the missing-code report is written (relative to the current
# working directory).
report_file_path = "./report.txt"

# YAML document listing the codes that are already known; main() reads
# its top-level 'pis' list.
code_store_list_url = "https://git.fjla.uk/OwlBoard/data/raw/branch/main/pis/gw.yaml"


def is_pdfgrep_installed():
    """Return True if ``pdfgrep --version`` can be run successfully.

    Returns:
        bool: True when the command starts and exits with status 0;
        False when it exits non-zero or cannot be started at all
        (for example because the executable is not on PATH).
    """
    try:
        # DEVNULL rather than PIPE: the output is never read, and the
        # subprocess docs advise against PIPE with check_call().
        subprocess.check_call(
            ["pdfgrep", "--version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # A missing executable raises FileNotFoundError (an OSError),
        # not CalledProcessError, so both must be treated as "absent".
        return False
    return True


def fetch_and_parse_yaml(url, timeout=30):
    """Download *url* and parse the response body as YAML.

    Args:
        url: Address of the YAML document to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The object produced by ``yaml.safe_load`` for the response text.

    On any download, HTTP-status or parse failure the error is printed
    and the process exits with status 1 (``SystemExit``).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return yaml.safe_load(response.text)
    except Exception as e:
        # Script boundary: report whatever went wrong and stop.
        print(f"Error downloading and parsing codes: {e}")
        sys.exit(1)


def _collect_codes_from_pdfs(pdf_directory):
    """Run pdfgrep over *pdf_directory* and return [{"file", "code"}, ...]."""
    # Pass an argument list (no shell) so directory names containing
    # spaces or shell metacharacters reach pdfgrep unmodified.
    completed = subprocess.run(
        ["pdfgrep", "-Prno", r"code\s*:\s*\d{4}", pdf_directory],
        stdout=subprocess.PIPE,
        # Diagnostics are discarded so they cannot be parsed as matches.
        stderr=subprocess.DEVNULL,
        universal_newlines=True,
    )

    line_pattern = re.compile(r"^(.*?):\s*code\s*:\s*(\d{4})")
    found = []
    for line in completed.stdout.splitlines():
        match = line_pattern.search(line)
        if match:
            filename, code = match.groups()
            found.append({"file": filename, "code": str(code)})
    return found


def _select_missing(code_list, existing_set):
    """Return the first entry of each distinct code that is not in *existing_set*."""
    seen = set()
    missing = []
    for item in code_list:
        code = item['code']
        if code in seen:
            continue
        seen.add(code)
        if code not in existing_set:
            missing.append(item)
    return missing


def _format_report(missing_codes):
    """Build the report text: a count header followed by one entry per code."""
    header = f"""
Number of missing codes found: {len(missing_codes)}

Missing Codes:
"""
    entries = [
        f"\n - code: {item['code']}\n stops: (File: {item['file']})"
        for item in missing_codes
    ]
    return header + "".join(entries)


def main():
    """Scan a directory of PDFs for codes missing from the code store.

    Expects exactly one command-line argument, the directory to scan.
    Exits with status 1 when the argument count is wrong, the path is
    not a directory, or pdfgrep cannot be run. Otherwise it searches the
    PDFs with pdfgrep, downloads the known codes, and writes a report of
    the codes that are not yet known to ``report_file_path`` and stdout.
    """
    if len(sys.argv) != 2:
        # Show the name the script was actually invoked as.
        script = os.path.basename(sys.argv[0]) if sys.argv else "find-missing-from-pdf-dir.py"
        print(f"Usage: python {script} <directory_path>")
        sys.exit(1)

    pdf_directory = sys.argv[1]

    if not os.path.isdir(pdf_directory):
        print(f"'{pdf_directory}' is not a valid directory.")
        sys.exit(1)

    if not is_pdfgrep_installed():
        print("pdfgrep is not installed on your system.")
        sys.exit(1)

    code_list = _collect_codes_from_pdfs(pdf_directory)

    existing_codes = fetch_and_parse_yaml(code_store_list_url)['pis']
    # Compared as strings because the codes found in PDFs are strings.
    # NOTE(review): if the YAML stores codes as unquoted numbers, any
    # leading zeros are lost before str() is applied - confirm the data.
    existing_set = {str(item['code']) for item in existing_codes}

    missing_codes = _select_missing(code_list, existing_set)
    report = _format_report(missing_codes)

    print(f"Saving report to {report_file_path}")
    with open(report_file_path, 'w', encoding="utf-8") as report_file:
        report_file.write(report)

    print(report)


# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Reference in New Issue
Block a user