Spaces:
Build error
Build error
| import requests | |
| import re | |
| from fpdf import FPDF | |
| import os | |
| import textract | |
# --- Configuration ---
# Each setting can be overridden with an environment variable of the same
# name; when the variable is unset the original hard-coded value is used.
# A trailing "/" on the service URL is dropped because callers append
# "/<endpoint>" themselves.
AI_SERVICE_URL = os.environ.get("AI_SERVICE_URL", "http://localhost:8000").rstrip("/")
INPUT_DOC_PATH = os.environ.get("INPUT_DOC_PATH", "Doreen.doc")
OUTPUT_PDF_PATH = os.environ.get(
    "OUTPUT_PDF_PATH", "Doreen DeFio_Dr. Daniel Rich_Report_Generated.pdf"
)
def correct_text_via_api(endpoint: str, text: str) -> str:
    """Return *text* after correction by the AI service, or *text* unchanged.

    POSTs ``{"text": text}`` as JSON to ``{AI_SERVICE_URL}/{endpoint}`` and
    returns the ``"corrected_text"`` field of the JSON reply.

    This is a best-effort call: on any transport error, HTTP error status,
    timeout, or malformed reply a message is printed and the original *text*
    is returned, so a failing service never aborts the caller.

    Args:
        endpoint: Path segment appended to the service base URL.
        text: The text to correct.

    Returns:
        The corrected text, or *text* itself when correction is not possible.
    """
    # Nothing to correct: skip the network round-trip entirely.
    if not text or not text.strip():
        return text

    try:
        response = requests.post(
            f"{AI_SERVICE_URL}/{endpoint}",
            json={"text": text},
            # (connect, read) seconds. Without a timeout requests waits
            # forever on a stalled service; the read limit is generous
            # because the service may be slow to answer.
            timeout=(10, 300),
        )
        response.raise_for_status()
        return response.json()["corrected_text"]
    except requests.exceptions.RequestException as e:
        print(f"Error calling AI service at endpoint '{endpoint}': {e}")
    except (KeyError, TypeError, ValueError) as e:
        # Reply was not JSON, not an object, or lacked "corrected_text".
        print(f"Unexpected response from AI service at endpoint '{endpoint}': {e!r}")
    return text
def extract_text_from_doc(filepath):
    """Extract the text of the document at *filepath* using textract.

    Returns the UTF-8 decoded output of ``textract.process``. If textract or
    the decoding fails for any reason, the error is printed and ``None`` is
    returned instead.

    Raises:
        FileNotFoundError: if *filepath* does not exist.
    """
    if os.path.exists(filepath):
        try:
            return textract.process(filepath).decode('utf-8')
        except Exception as err:
            # Best-effort: report the problem and let the caller see None.
            print(f"Error reading document with textract: {err}")
            return None
    raise FileNotFoundError(f"Input file not found at: {filepath}")
# Field labels recognised on "Label: value" lines, in canonical spelling.
_FIELD_LABELS = (
    "Client Name",
    "Date of Exam",
    "Date of Accident",
    "Examinee",
    "Observed By",
    "Performed By",
    "Specialty",
    "Facility",
    "Facility Description",
    "Appointment Scheduled For",
    "Arrived at Office",
    "Admitted to Exam Room",
    "Intake Start",
    "Exam Start",
    "Exam End",
    "Length of Exam",
    "Total Length of Visit",
    "Others Present",
    "Description of IME physician",
    "Layout of Exam Room",
    "Did IME Physician Have Examinees Medical Records",
)

# Labels are matched case-insensitively, so map whatever casing the document
# used back to the canonical spelling before using it as a dictionary key.
_CANONICAL_LABEL = {label.casefold(): label for label in _FIELD_LABELS}

# Compiled once at import time instead of on every call.
_KEY_VALUE_PATTERN = re.compile(
    r"^\s*(" + "|".join(re.escape(label) for label in _FIELD_LABELS) + r")\s*:\s*(.*)",
    re.IGNORECASE | re.DOTALL,
)

# Lines that open a free-text section (compared exactly, after stripping).
_SECTION_HEADERS = ("Intake:", "Exam:")


def _correct_section(section_lines):
    """Join a section's lines and run grammar, then gender, correction."""
    paragraph = " ".join(section_lines)
    grammar_corrected = correct_text_via_api("correct_grammar", paragraph)
    return correct_text_via_api("correct_gender", grammar_corrected)


def parse_and_correct_text(raw_text):
    """Parse extracted report text into a dict of corrected fields.

    Blank lines are ignored and every remaining line is stripped. Each line
    is then handled as one of:

    * a section header ("Intake:" or "Exam:"): starts a free-text section.
      The lines collected for the previous section are joined with spaces,
      corrected for grammar and then gender, and stored under the section
      name without its colon;
    * a "Label: value" line for a known label: the value is grammar-corrected
      and stored under the label's canonical spelling. When the value is
      empty, the next line is used as the value unless that line is itself a
      header or a "Label: value" line;
    * any other line: appended to the current section, or discarded when no
      section has started yet.

    Args:
        raw_text: The full text of the document.

    Returns:
        dict mapping field and section names to their corrected text.
    """
    structured_data = {}
    current_section = None
    buffer = []

    lines = [line.strip() for line in raw_text.split('\n') if line.strip()]
    i = 0
    while i < len(lines):
        line = lines[i]

        if line in _SECTION_HEADERS:
            # A new section begins: store what the previous one collected.
            if current_section and buffer:
                structured_data[current_section] = _correct_section(buffer)
            current_section = line.replace(":", "").strip()
            buffer = []
            i += 1
            continue

        match = _KEY_VALUE_PATTERN.match(line)
        if match:
            key, value = (part.strip() for part in match.groups())
            # Without this, "client name:" would be stored under a key that
            # a case-sensitive lookup for "Client Name" never finds.
            key = _CANONICAL_LABEL.get(key.casefold(), key)
            if not value and i + 1 < len(lines):
                following = lines[i + 1]
                if (
                    following not in _SECTION_HEADERS
                    and not _KEY_VALUE_PATTERN.match(following)
                ):
                    # The value sits on its own line; consume that line too.
                    value = following
                    i += 1
            # An empty value has nothing to correct; avoid the service call.
            structured_data[key] = (
                correct_text_via_api("correct_grammar", value) if value else value
            )
        elif current_section:
            buffer.append(line)
        i += 1

    # Store the section that was still open when the input ended.
    if current_section and buffer:
        structured_data[current_section] = _correct_section(buffer)
    return structured_data
class PDF(FPDF):
    """FPDF page template with a centred report title and page-number footer."""

    # Font family used by header() and footer(). generate_pdf() replaces it
    # with a built-in font on the instance when the DejaVu files cannot be
    # loaded, so that page creation does not fail on an unregistered family.
    report_font = "DejaVu"

    def header(self):
        """Draw the report title at the top of the page."""
        self.set_font(self.report_font, "B", 15)
        self.cell(0, 10, 'IME WatchDog Report', 0, 1, 'C')
        self.ln(10)

    def footer(self):
        """Draw the centred page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font(self.report_font, "I", 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')


# Built-in PDF font used when the DejaVu TTF files are unavailable.
_FALLBACK_FONT = "Helvetica"


def _latin1_safe(text):
    """Replace characters that cannot be encoded as Latin-1 with '?'."""
    return text.encode("latin-1", "replace").decode("latin-1")


def generate_pdf(data, output_path):
    """Render the parsed report fields to a PDF file at *output_path*.

    Fields are written in a fixed order; names missing from *data* are
    skipped, and entries of *data* not in that order are not written.
    Each field is printed as a bold "<name>:" line followed by its value.

    The DejaVu fonts are loaded from ``DejaVuSans.ttf``,
    ``DejaVuSans-Bold.ttf`` and ``DejaVuSans-Oblique.ttf``. If loading fails,
    a warning is printed and the report is rendered with a built-in font,
    with characters outside Latin-1 replaced by "?".

    Args:
        data: Mapping of field/section name to its text.
        output_path: Destination path of the PDF file.
    """
    pdf = PDF()

    unicode_font_loaded = True
    try:
        pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
        pdf.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)
        pdf.add_font("DejaVu", "I", "DejaVuSans-Oblique.ttf", uni=True)
    except (RuntimeError, OSError):
        # Catch both RuntimeError and OSError (which covers
        # FileNotFoundError), since the exception raised for a missing
        # font file differs between FPDF implementations.
        unicode_font_loaded = False
        # Header, footer and body must all avoid the unregistered family,
        # otherwise the first set_font() call fails.
        pdf.report_font = _FALLBACK_FONT
        print("---")
        print("⚠️ FONT WARNING: DejaVuSans.ttf not found.")
        print("The PDF will be generated, but may have character issues.")
        print("Please download the DejaVu font family and place the .ttf files in this directory.")
        print("---")

    font_family = pdf.report_font

    def printable(value):
        """Return *value* as text the active font is able to render."""
        text = str(value)
        return text if unicode_font_loaded else _latin1_safe(text)

    pdf.add_page()
    pdf.set_font(font_family, "", 12)

    key_order = [
        "Client Name", "Date of Exam", "Date of Accident", "Examinee", "Observed By",
        "Performed By", "Specialty", "Facility", "Facility Description",
        "Appointment Scheduled For", "Arrived at Office", "Admitted to Exam Room",
        "Intake Start", "Exam Start", "Exam End", "Length of Exam", "Total Length of Visit",
        "Others Present", "Description of IME physician", "Layout of Exam Room",
        "Did IME Physician Have Examinees Medical Records", "Intake", "Exam",
    ]

    for key in key_order:
        if key not in data:
            continue
        pdf.set_font(font_family, "B", 12)
        pdf.cell(0, 10, f"{key}:", ln=True)
        pdf.set_font(font_family, "", 12)
        pdf.multi_cell(0, 8, printable(data[key]))
        pdf.ln(4)

    pdf.output(output_path)
    print(f"✅ Successfully generated PDF report at: {output_path}")
def main():
    """Run the extract -> correct -> render pipeline.

    Returns:
        0 when the PDF was generated, 1 when the input file is missing or no
        text could be extracted from it.
    """
    print("--- Starting Document Transformation Pipeline ---")

    if not os.path.exists(INPUT_DOC_PATH):
        print(f"❌ ERROR: Input file not found: '{INPUT_DOC_PATH}'")
        return 1

    print(f"1. Extracting text from '{INPUT_DOC_PATH}' using textract...")
    raw_document_text = extract_text_from_doc(INPUT_DOC_PATH)
    if not raw_document_text:
        # extract_text_from_doc returns None on failure; an empty document
        # also leaves nothing to process. Say so instead of stopping silently.
        print(f"❌ ERROR: No text could be extracted from '{INPUT_DOC_PATH}'")
        return 1

    print("2. Parsing and correcting text via AI microservice...")
    corrected_data = parse_and_correct_text(raw_document_text)

    print(f"3. Generating PDF report '{OUTPUT_PDF_PATH}'...")
    generate_pdf(corrected_data, OUTPUT_PDF_PATH)

    print("--- Pipeline Finished ---")
    return 0


if __name__ == "__main__":
    # Propagate the result as the process exit status.
    raise SystemExit(main())