Spaces:

enoch10jason
/

grammar-correction-api

Build error

grammar-correction-api / document_pipeline.py

Enoch Jason J

Add application file

9fca407 2 months ago

5.79 kB

	import requests
	import re
	from fpdf import FPDF
	import os
	import textract

	# --- Configuration ---
	# Base URL of the AI correction service; correct_text_via_api() appends the
	# endpoint name to it and POSTs the text there.
	AI_SERVICE_URL = "http://localhost:8000"
	# Path of the input document handed to extract_text_from_doc() by the script entry point.
	INPUT_DOC_PATH = "Doreen.doc"
	# Path the generated PDF report is written to by generate_pdf().
	OUTPUT_PDF_PATH = "Doreen DeFio_Dr. Daniel Rich_Report_Generated.pdf"

	def correct_text_via_api(endpoint: str, text: str, timeout: float = 60.0) -> str:
	    """Send *text* to the AI correction service and return the corrected text.

	    This is a best-effort call: on any transport error, HTTP error status, or
	    malformed response the problem is printed and the original *text* is
	    returned unchanged, so one failed request does not abort the pipeline.

	    Args:
	        endpoint: Path segment appended to ``AI_SERVICE_URL``
	            (for example ``"correct_grammar"``).
	        text: The text to be corrected.
	        timeout: Seconds to wait for the service before giving up. Requests
	            applies no timeout by default, which would let a stalled service
	            hang the pipeline forever.

	    Returns:
	        The ``"corrected_text"`` field of the JSON response, or *text* itself
	        when the request fails or the response cannot be interpreted.
	    """
	    url = f"{AI_SERVICE_URL}/{endpoint}"
	    try:
	        response = requests.post(url, json={"text": text}, timeout=timeout)
	        response.raise_for_status()
	        return response.json()["corrected_text"]
	    except requests.exceptions.RequestException as e:
	        print(f"Error calling AI service at endpoint '{endpoint}': {e}")
	    except (KeyError, TypeError, ValueError) as e:
	        # Non-JSON body (ValueError on older Requests releases), a JSON value
	        # that is not an object (TypeError), or a missing key (KeyError).
	        print(f"Unexpected response from AI service at endpoint '{endpoint}': {e}")
	    return text

	def extract_text_from_doc(filepath):
	    """Extract the text content of the document at *filepath* using textract.

	    Args:
	        filepath: Path of the document to read.

	    Returns:
	        The extracted text decoded as UTF-8, or ``None`` when textract fails
	        or the decoded bytes are not valid UTF-8 (the error is printed).

	    Raises:
	        FileNotFoundError: If *filepath* does not exist.
	    """
	    if not os.path.exists(filepath):
	        raise FileNotFoundError(f"Input file not found at: {filepath}")
	    try:
	        text_bytes = textract.process(filepath)
	        return text_bytes.decode('utf-8')
	    except Exception as e:
	        # Deliberately broad: any extraction or decoding failure is reported
	        # and signalled to the caller as None rather than raised.
	        print(f"Error reading document with textract: {e}")
	        return None

	def _correct_paragraph(paragraph_lines):
	    """Join section lines into one paragraph and run grammar, then gender correction."""
	    full_paragraph = " ".join(paragraph_lines)
	    grammar_corrected = correct_text_via_api("correct_grammar", full_paragraph)
	    return correct_text_via_api("correct_gender", grammar_corrected)


	def parse_and_correct_text(raw_text):
	    """Parse the raw report text into fields and sections and correct each value.

	    The text is processed line by line (blank lines are dropped):

	    * A line equal to ``"Intake:"`` or ``"Exam:"`` starts a free-text section.
	      The lines collected for a section are joined with spaces and sent
	      through the ``correct_grammar`` and then the ``correct_gender`` endpoint.
	    * A line of the form ``<known field name> : <value>`` is stored as a
	      field whose value is sent through ``correct_grammar``. When the value is
	      empty, the following line is used as the value unless that line is
	      itself a field or a section header.
	    * Any other line is appended to the current section, or ignored when no
	      section has started yet.

	    Field names are matched case-insensitively but stored under their
	    canonical spelling, so later exact-key lookups find them regardless of
	    how the document capitalised them.

	    Args:
	        raw_text: The full text of the document. ``None`` or an empty string
	            yields an empty result.

	    Returns:
	        A dict mapping field names and section names (``"Intake"``,
	        ``"Exam"``) to their corrected text.
	    """
	    if not raw_text:
	        return {}

	    field_names = (
	        "Client Name", "Date of Exam", "Date of Accident", "Examinee", "Observed By",
	        "Performed By", "Specialty", "Facility", "Facility Description",
	        "Appointment Scheduled For", "Arrived at Office", "Admitted to Exam Room",
	        "Intake Start", "Exam Start", "Exam End", "Length of Exam",
	        "Total Length of Visit", "Others Present", "Description of IME physician",
	        "Layout of Exam Room", "Did IME Physician Have Examinees Medical Records",
	    )
	    canonical_names = {name.lower(): name for name in field_names}
	    # "<field name> : <value>" with optional whitespace around the name and colon.
	    key_value_pattern = re.compile(
	        r'^\s*(' + '|'.join(re.escape(name) for name in field_names) + r')\s*:\s*(.*)',
	        re.IGNORECASE | re.DOTALL,
	    )
	    section_headers = ["Intake:", "Exam:"]
	    lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

	    structured_data = {}
	    current_section = None
	    buffer = []

	    # An index loop is used because a field with an empty value may consume
	    # the following line as well.
	    i = 0
	    while i < len(lines):
	        line = lines[i]
	        if line in section_headers:
	            # A new section begins: finish the one collected so far.
	            if current_section and buffer:
	                structured_data[current_section] = _correct_paragraph(buffer)
	            current_section = line.replace(":", "").strip()
	            buffer = []
	            i += 1
	            continue

	        match = key_value_pattern.match(line)
	        if match:
	            key, value = map(str.strip, match.groups())
	            key = canonical_names.get(key.lower(), key)
	            next_index = i + 1
	            if (
	                not value
	                and next_index < len(lines)
	                and not key_value_pattern.match(lines[next_index])
	                and lines[next_index] not in section_headers
	            ):
	                # The value was placed on the line after the field name.
	                value = lines[next_index]
	                i += 1
	            structured_data[key] = correct_text_via_api("correct_grammar", value)
	        elif current_section:
	            buffer.append(line)
	        i += 1

	    # Finish the last section, which has no following header to trigger it.
	    if current_section and buffer:
	        structured_data[current_section] = _correct_paragraph(buffer)
	    return structured_data

	class PDF(FPDF):
	    """FPDF document with the report title as page header and a page-number footer."""

	    # Font family used by the header and footer. generate_pdf() overrides this
	    # on the instance when the DejaVu fonts cannot be registered.
	    report_font = "DejaVu"

	    def header(self):
	        """Draw the centred report title at the top of the page."""
	        self.set_font(self.report_font, "B", 15)
	        self.cell(0, 10, 'IME WatchDog Report', 0, 1, 'C')
	        self.ln(10)

	    def footer(self):
	        """Draw the centred page number 15 units above the bottom of the page."""
	        self.set_y(-15)
	        self.set_font(self.report_font, "I", 8)
	        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')


	def generate_pdf(data, output_path):
	    """Render the corrected report data into a PDF file.

	    Entries of *data* are written in a fixed order; keys that are absent from
	    *data* are skipped, and keys not in that fixed order are not written.

	    The DejaVu TrueType fonts are registered so that non-Latin-1 characters
	    such as ’ can be rendered. If they cannot be loaded, a warning is printed
	    and the report falls back to the built-in Helvetica font, with characters
	    outside Latin-1 replaced by ``?`` so the PDF can still be produced.

	    Args:
	        data: Mapping of field/section name to text.
	        output_path: Destination path of the PDF file.
	    """
	    pdf = PDF()
	    unicode_font_available = True
	    # You may need to provide the path to the .ttf font files if they are not
	    # in a location the PDF library searches.
	    try:
	        pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
	        pdf.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)
	        pdf.add_font("DejaVu", "I", "DejaVuSans-Oblique.ttf", uni=True)
	    except (RuntimeError, OSError):
	        # A missing font file is reported as RuntimeError or as
	        # FileNotFoundError (an OSError), depending on the fpdf release.
	        unicode_font_available = False
	        # Without this switch the header, footer and body would still select
	        # the unregistered "DejaVu" family and PDF generation would fail.
	        pdf.report_font = "Helvetica"
	        print("---")
	        print("⚠️ FONT WARNING: DejaVuSans.ttf not found.")
	        print("The PDF will be generated, but may have character issues.")
	        print("Please download the DejaVu font family and place the .ttf files in this directory.")
	        print("---")

	    def printable(value):
	        """Return *value* as text the active font can encode."""
	        text = str(value)
	        if unicode_font_available:
	            return text
	        # Built-in fonts only cover Latin-1; substitute anything else.
	        return text.encode("latin-1", "replace").decode("latin-1")

	    font = pdf.report_font
	    pdf.add_page()
	    pdf.set_font(font, "", 12)
	    key_order = [
	        "Client Name", "Date of Exam", "Date of Accident", "Examinee", "Observed By",
	        "Performed By", "Specialty", "Facility", "Facility Description",
	        "Appointment Scheduled For", "Arrived at Office", "Admitted to Exam Room",
	        "Intake Start", "Exam Start", "Exam End", "Length of Exam", "Total Length of Visit",
	        "Others Present", "Description of IME physician", "Layout of Exam Room",
	        "Did IME Physician Have Examinees Medical Records", "Intake", "Exam"
	    ]
	    for key in key_order:
	        if key not in data:
	            continue
	        # Bold label on its own line, followed by the wrapped value.
	        pdf.set_font(font, "B", 12)
	        pdf.cell(0, 10, f"{key}:", ln=True)
	        pdf.set_font(font, "", 12)
	        pdf.multi_cell(0, 8, printable(data[key]))
	        pdf.ln(4)
	    pdf.output(output_path)
	    print(f"✅ Successfully generated PDF report at: {output_path}")

	def main():
	    """Run the pipeline: extract the document text, correct it, render the PDF.

	    Uses the module-level configuration (``INPUT_DOC_PATH``,
	    ``OUTPUT_PDF_PATH``). Every failure is reported on stdout and ends the
	    run early; nothing is raised for a missing or unreadable input file.
	    """
	    print("--- Starting Document Transformation Pipeline ---")
	    if not os.path.exists(INPUT_DOC_PATH):
	        print(f"❌ ERROR: Input file not found: '{INPUT_DOC_PATH}'")
	        return

	    print(f"1. Extracting text from '{INPUT_DOC_PATH}' using textract...")
	    raw_document_text = extract_text_from_doc(INPUT_DOC_PATH)
	    if not raw_document_text:
	        # Extraction failed or produced no text; report it instead of
	        # ending silently.
	        print(f"❌ ERROR: No text could be extracted from '{INPUT_DOC_PATH}'")
	        return

	    print("2. Parsing and correcting text via AI microservice...")
	    corrected_data = parse_and_correct_text(raw_document_text)
	    print(f"3. Generating PDF report '{OUTPUT_PDF_PATH}'...")
	    generate_pdf(corrected_data, OUTPUT_PDF_PATH)
	    print("--- Pipeline Finished ---")


	if __name__ == "__main__":
	    main()