# grammar-correction-api / document_pipeline.py
# Enoch Jason J
# Add application file
# 9fca407
import requests
import re
from fpdf import FPDF
import os
import textract
# --- Configuration ---
# Base URL of the AI correction service; endpoint names are appended as
# "<AI_SERVICE_URL>/<endpoint>" by correct_text_via_api.
AI_SERVICE_URL = "http://localhost:8000"
# Path of the input document whose text is extracted when run as a script.
INPUT_DOC_PATH = "Doreen.doc"
# Path the generated PDF report is written to when run as a script.
OUTPUT_PDF_PATH = "Doreen DeFio_Dr. Daniel Rich_Report_Generated.pdf"
def correct_text_via_api(endpoint: str, text: str, timeout: float = 60.0) -> str:
    """Send *text* to an AI service endpoint and return the corrected text.

    The request is a JSON POST of ``{"text": text}`` to
    ``AI_SERVICE_URL/<endpoint>``; the reply is expected to be a JSON object
    with a ``"corrected_text"`` entry.

    This is best-effort: on any transport error, HTTP error status, timeout,
    or malformed reply, a message is printed and the original *text* is
    returned unchanged so the pipeline can continue.

    Args:
        endpoint: Path segment of the service endpoint, e.g. "correct_grammar".
        text: The text to correct.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled server.

    Returns:
        The corrected text, or *text* itself if the call failed.
    """
    url = f"{AI_SERVICE_URL}/{endpoint}"
    try:
        response = requests.post(url, json={"text": text}, timeout=timeout)
        response.raise_for_status()
        return response.json()["corrected_text"]
    except requests.exceptions.RequestException as e:
        print(f"Error calling AI service at endpoint '{endpoint}': {e}")
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body was not JSON; KeyError/TypeError: JSON did not have
        # the expected {"corrected_text": ...} shape.
        print(f"Unexpected response from AI service at endpoint '{endpoint}': {e!r}")
    return text
def extract_text_from_doc(filepath):
    """Extract the text content of the document at *filepath* via textract.

    Returns the extracted text decoded as UTF-8, or ``None`` (after printing
    the error) if extraction or decoding fails.

    Raises:
        FileNotFoundError: if *filepath* does not exist.
    """
    if os.path.exists(filepath):
        try:
            return textract.process(filepath).decode('utf-8')
        except Exception as err:
            # Best-effort: report the problem and let the caller see None.
            print(f"Error reading document with textract: {err}")
            return None
    raise FileNotFoundError(f"Input file not found at: {filepath}")
# Field labels recognised as "Label: value" lines, in their canonical spelling
# (the spelling generate_pdf looks up in its key_order list).
_FIELD_LABELS = (
    "Client Name", "Date of Exam", "Date of Accident", "Examinee", "Observed By",
    "Performed By", "Specialty", "Facility", "Facility Description",
    "Appointment Scheduled For", "Arrived at Office", "Admitted to Exam Room",
    "Intake Start", "Exam Start", "Exam End", "Length of Exam",
    "Total Length of Visit", "Others Present", "Description of IME physician",
    "Layout of Exam Room", "Did IME Physician Have Examinees Medical Records",
)

# Case-folded label -> canonical label. Labels are matched case-insensitively,
# so the matched text must be mapped back before being used as a dict key.
_CANONICAL_LABEL = {label.casefold(): label for label in _FIELD_LABELS}

# Compiled once at import time rather than on every call.
_KEY_VALUE_PATTERN = re.compile(
    r'^\s*(' + '|'.join(re.escape(label) for label in _FIELD_LABELS) + r')\s*:\s*(.*)',
    re.IGNORECASE | re.DOTALL
)

# Lines that start a free-text narrative section.
_SECTION_HEADERS = ("Intake:", "Exam:")


def _correct_narrative(paragraph_lines):
    """Join section lines into one paragraph and run grammar, then gender, correction."""
    paragraph = " ".join(paragraph_lines)
    grammar_corrected = correct_text_via_api("correct_grammar", paragraph)
    return correct_text_via_api("correct_gender", grammar_corrected)


def parse_and_correct_text(raw_text: str) -> dict:
    """Parse the extracted report text into corrected fields and sections.

    Blank lines are dropped and every remaining line is stripped. Then:

    * A line equal to "Intake:" or "Exam:" starts a narrative section; the
      following non-field lines are joined with spaces and passed through the
      "correct_grammar" and then "correct_gender" endpoints.
    * A line of the form "<Label>: <value>" (label matched case-insensitively
      against the known field labels) becomes a field whose value is passed
      through "correct_grammar". If the value is empty, the next line is used
      as the value unless that line is itself a field or a section header.
    * Lines outside any section that are not fields are ignored.

    Args:
        raw_text: Full text of the source document.

    Returns:
        Dict mapping canonical field labels and section names ("Intake",
        "Exam") to corrected text.
    """
    structured_data = {}
    current_section = None
    buffer = []
    lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

    # Index-based loop: a field with an empty value may consume the next line.
    i = 0
    while i < len(lines):
        line = lines[i]

        if line in _SECTION_HEADERS:
            # Close the section collected so far before opening the new one.
            if current_section and buffer:
                structured_data[current_section] = _correct_narrative(buffer)
            current_section = line.replace(":", "").strip()
            buffer = []
            i += 1
            continue

        match = _KEY_VALUE_PATTERN.match(line)
        if match:
            raw_key, value = (part.strip() for part in match.groups())
            # Store under the canonical spelling so that e.g. "client name:"
            # is not lost when the PDF is built from the canonical key list.
            key = _CANONICAL_LABEL.get(raw_key.casefold(), raw_key)
            if not value and i + 1 < len(lines):
                following = lines[i + 1]
                if following not in _SECTION_HEADERS and not _KEY_VALUE_PATTERN.match(following):
                    value = following
                    i += 1
            # Nothing to correct in an empty value; skip the service call.
            structured_data[key] = (
                correct_text_via_api("correct_grammar", value) if value else value
            )
        elif current_section:
            buffer.append(line)
        i += 1

    # Flush the final section, which has no following header to trigger it.
    if current_section and buffer:
        structured_data[current_section] = _correct_narrative(buffer)
    return structured_data
class PDF(FPDF):
    """FPDF document with the report's title header and page-number footer."""

    # Font family used by header() and footer(). generate_pdf() overrides this
    # on the instance with a built-in core font when DejaVu cannot be loaded.
    report_font = "DejaVu"

    def header(self):
        """Draw the centred report title at the top of each page."""
        self.set_font(self.report_font, "B", 15)
        self.cell(0, 10, 'IME WatchDog Report', 0, 1, 'C')
        self.ln(10)

    def footer(self):
        """Draw the centred page number, positioned 15 units above the bottom."""
        self.set_y(-15)
        self.set_font(self.report_font, "I", 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')


# Built-in core font used when the DejaVu TTF files are unavailable.
_FALLBACK_FONT = "Helvetica"


def _to_latin1(text):
    """Replace characters outside Latin-1 with '?'; core PDF fonts cannot encode them."""
    return text.encode("latin-1", "replace").decode("latin-1")


def generate_pdf(data, output_path):
    """Render the parsed report *data* to a PDF file at *output_path*.

    Entries are written in a fixed order (header fields first, then the
    "Intake" and "Exam" narratives); keys missing from *data* are skipped and
    keys not in the fixed order are ignored. Each entry is a bold "<key>:"
    line followed by its value as wrapped text.

    The DejaVu TTF fonts (DejaVuSans.ttf, DejaVuSans-Bold.ttf,
    DejaVuSans-Oblique.ttf) are used so Unicode characters such as ’ render.
    If they cannot be loaded, a warning is printed and the report is still
    produced with the built-in Helvetica font, with characters outside
    Latin-1 replaced by '?'.

    Args:
        data: Mapping of field/section name to text.
        output_path: Destination path of the PDF file.
    """
    pdf = PDF()
    unicode_font_loaded = True
    try:
        pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
        pdf.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)
        pdf.add_font("DejaVu", "I", "DejaVuSans-Oblique.ttf", uni=True)
    except (RuntimeError, OSError):
        # A missing font file is reported as RuntimeError by older FPDF
        # releases and as FileNotFoundError (an OSError) by fpdf2. Fall back
        # to a core font so header, footer and body do not fail on an
        # unregistered "DejaVu" family.
        unicode_font_loaded = False
        pdf.report_font = _FALLBACK_FONT
        print("---")
        print("⚠️ FONT WARNING: DejaVuSans.ttf not found.")
        print("The PDF will be generated, but may have character issues.")
        print("Please download the DejaVu font family and place the .ttf files in this directory.")
        print("---")

    font = pdf.report_font
    pdf.add_page()
    pdf.set_font(font, "", 12)

    key_order = [
        "Client Name", "Date of Exam", "Date of Accident", "Examinee", "Observed By",
        "Performed By", "Specialty", "Facility", "Facility Description",
        "Appointment Scheduled For", "Arrived at Office", "Admitted to Exam Room",
        "Intake Start", "Exam Start", "Exam End", "Length of Exam", "Total Length of Visit",
        "Others Present", "Description of IME physician", "Layout of Exam Room",
        "Did IME Physician Have Examinees Medical Records", "Intake", "Exam"
    ]
    for key in key_order:
        if key not in data:
            continue
        text = str(data[key])
        if not unicode_font_loaded:
            # Core fonts only cover Latin-1; anything else would fail to encode.
            text = _to_latin1(text)
        pdf.set_font(font, "B", 12)
        pdf.cell(0, 10, f"{key}:", ln=True)
        pdf.set_font(font, "", 12)
        pdf.multi_cell(0, 8, text)
        pdf.ln(4)

    pdf.output(output_path)
    print(f"✅ Successfully generated PDF report at: {output_path}")
if __name__ == "__main__":
    # Script entry point: extract text -> correct via AI service -> render PDF.
    print("--- Starting Document Transformation Pipeline ---")
    if not os.path.exists(INPUT_DOC_PATH):
        print(f"❌ ERROR: Input file not found: '{INPUT_DOC_PATH}'")
    else:
        print(f"1. Extracting text from '{INPUT_DOC_PATH}' using textract...")
        raw_document_text = extract_text_from_doc(INPUT_DOC_PATH)
        if raw_document_text:
            print("2. Parsing and correcting text via AI microservice...")
            corrected_data = parse_and_correct_text(raw_document_text)
            print(f"3. Generating PDF report '{OUTPUT_PDF_PATH}'...")
            generate_pdf(corrected_data, OUTPUT_PDF_PATH)
            print("--- Pipeline Finished ---")
        else:
            # extract_text_from_doc returned None (read error) or an empty
            # string; say so instead of ending without any explanation.
            print(f"❌ ERROR: No text could be extracted from '{INPUT_DOC_PATH}'; no PDF was generated.")