# dgp2/src/main.py (82 lines, 2.0 KiB, Python)

import os
import sys
import glob
# (web-view blame timestamp: 2024-10-26 11:37:07 +01:00)
import datetime
import json
import parse_pdf
# List all PDF files in the given directory
def list_pdf_files(directory):
    """Return the paths of the ``*.pdf`` files directly inside *directory*.

    Args:
        directory: Path of the directory to search (not searched recursively).

    Returns:
        A sorted list of matching paths, each prefixed with *directory*.
        The list is empty when nothing matches.
    """
    # Escape the directory so characters such as "[" or "*" in its name are
    # matched literally instead of being interpreted as glob wildcards.
    pattern = os.path.join(glob.escape(directory), '*.pdf')
    # glob returns results in arbitrary order; sort for reproducible output.
    return sorted(glob.glob(pattern))
# Extracts date from PDF filename and returns its content
def get_schedule_card_data(filepath):
    """Build the schedule-card record for one PDF whose name starts with MMDD.

    The first four characters of the file's base name are read as a
    two-digit month followed by a two-digit day. The year is taken from the
    current local date.

    Args:
        filepath: Path to the PDF file.

    Returns:
        A dict with the keys "schedule_date" (a ``datetime.datetime``) and
        "schedule_data" (the value returned by ``parse_pdf.parse_pdf_file``
        for *filepath*), or ``None`` when the four-character prefix is not a
        valid month/day combination.

    Raises:
        ValueError: If the base name is shorter than four characters.
    """
    filename = os.path.basename(filepath)
    date_str = filename[:4]
    if len(date_str) < 4:
        raise ValueError("Filename must have at least four characters")

    try:
        month = int(date_str[:2])
        day = int(date_str[2:])
        year = datetime.datetime.now().year
        # datetime() itself raises ValueError for out-of-range month/day,
        # so non-numeric and impossible dates share one error path.
        date_object = datetime.datetime(year=year, month=month, day=day)
    except ValueError as e:
        print(f"Error parsing date: {e}")
        return None

    return {
        "schedule_date": date_object,
        "schedule_data": parse_pdf.parse_pdf_file(filepath),
    }
# Loop through data and remove duplicate codes.
# Check for existing codes via OwlBoard API.
# Validate existing codes and submit issue if not correct.
# Use the train list to search for stopping pattern of any absent codes.
# Format file and commit to git
def main():
    """Parse every PDF in a directory and write the results to output.txt.

    The directory is taken from the first command-line argument, or the
    current working directory when no argument is given. If the directory
    is not valid or contains no PDF files, an error is printed and the
    function returns without writing anything.

    Returns:
        None.
    """
    # Check for arguments
    directory = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()

    if not os.path.isdir(directory):
        print(f"Error: '{directory}' is not a valid directory")
        return

    pdf_files = list_pdf_files(directory)
    if not pdf_files:
        print(f"Error: '{directory}' contains no PDF files")
        return
    print(f"Found {len(pdf_files)} PDF files")

    schedule_cards = []
    for pdf_file in pdf_files:
        card = get_schedule_card_data(pdf_file)
        # None means the filename's date prefix was rejected (the error has
        # already been printed); skip it so the output holds no null entries.
        if card is not None:
            schedule_cards.append(card)

    # Serialise before opening the file so a serialisation failure cannot
    # truncate an existing output.txt. default=str renders the datetime
    # values, which json cannot encode natively.
    payload = json.dumps(schedule_cards, indent=4, default=str)
    with open("output.txt", "w", encoding="utf-8") as out:
        out.write(payload)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()