Emeritus-21 committed on
Commit de5e2ab · verified · 1 Parent(s): 5e2af9a

Update app.py

Files changed (1):
  app.py +29 -70
app.py CHANGED
@@ -3,13 +3,10 @@ from threading import Thread
  import gradio as gr
  import spaces
  from PIL import Image
+ import numpy as np
+ import cv2
  import torch
- from transformers import AutoProcessor, AutoModelForImageTextToText, Qwen2_5_VLForConditionalGeneration
- from reportlab.platypus import SimpleDocTemplate, Paragraph
- from reportlab.lib.styles import getSampleStyleSheet
- from docx import Document
- from gtts import gTTS
- from jiwer import cer
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

  # ---------------- Models ----------------
  MODEL_PATHS = {
@@ -35,76 +32,38 @@ for name, (repo_id, cls) in MODEL_PATHS.items():
      except Exception as e:
          print(f"⚠️ Failed to load {name}: {e}")

- # ---------------- Helpers ----------------
- def _build_inputs(processor, tokenizer, image: Image.Image, prompt: str):
-     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]
-     if tokenizer and hasattr(tokenizer, "apply_chat_template"):
-         chat_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         return processor(text=[chat_prompt], images=[image], return_tensors="pt")
-     return processor(text=[prompt], images=[image], return_tensors="pt")
-
- def _decode_text(model, processor, tokenizer, output_ids, prompt: str):
-     try:
-         decoded_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-         prompt_start = decoded_text.find(prompt)
-         if prompt_start != -1:
-             decoded_text = decoded_text[prompt_start + len(prompt):].strip()
-         else:
-             decoded_text = decoded_text.strip()
-         return decoded_text
-     except Exception:
-         try:
-             decoded_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
-             prompt_start = decoded_text.find(prompt)
-             if prompt_start != -1:
-                 decoded_text = decoded_text[prompt_start + len(prompt):].strip()
-             return decoded_text
-         except Exception:
-             return str(output_ids).strip()
-
- # 🚀 Updated prompt with underline tagging instructions
- def _default_prompt(query: str | None) -> str:
-     if query and query.strip():
-         return query.strip()
-     return (
-         "You are a professional Handwritten OCR system.\n"
-         "TASK: Read the handwritten image and transcribe the text EXACTLY as written.\n"
-         "- Preserve original structure and line breaks.\n"
-         "- Keep spacing, bullet points, numbering, and indentation.\n"
-         "- Render tables as Markdown tables if present.\n"
-         "- Detect and mark UNDERLINED text with <u>...</u> tags.\n"
-         "- If text is double-underlined, wrap twice: <u><u>...</u></u>.\n"
-         "- Do NOT autocorrect spelling or grammar.\n"
-         "- Do NOT merge lines.\n"
-         "Return RAW transcription only."
-     )
-
- # ---------------- OCR Function ----------------
+ # ---------------- Underline Detection ----------------
+ def detect_underlines(image: Image.Image):
+     cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
+     _, thresh = cv2.threshold(cv_img, 150, 255, cv2.THRESH_BINARY_INV)
+     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 1))
+     detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)
+     return detected_lines
+
+ # ---------------- OCR + Underline ----------------
  @spaces.GPU
- def ocr_image(image: Image.Image, model_choice: str, query: str = None,
-               max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT,
-               temperature: float = 0.1, top_p: float = 1.0, top_k: int = 0, repetition_penalty: float = 1.0,
-               progress=gr.Progress(track_tqdm=True)):
-     if image is None: return "Please upload or capture an image."
+ def ocr_with_underlines(image: Image.Image, model_choice: str, query: str = None,
+                         max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT):
+     if image is None: return "Please upload an image."
      if model_choice not in _loaded_models: return f"Invalid model: {model_choice}"
-     processor, model, tokenizer = _loaded_processors[model_choice], _loaded_models[model_choice], getattr(_loaded_processors[model_choice], "tokenizer", None)
-     prompt = _default_prompt(query)
-     batch = _build_inputs(processor, tokenizer, image, prompt).to(device)
+     processor, model = _loaded_processors[model_choice], _loaded_models[model_choice]
+
+     # Run OCR
+     inputs = processor(images=image, text="Transcribe handwriting.", return_tensors="pt").to(device)
      with torch.inference_mode():
-         output_ids = model.generate(**batch, max_new_tokens=max_new_tokens, do_sample=False,
-                                     temperature=temperature, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty)
-     return _decode_text(model, processor, tokenizer, output_ids, prompt).replace("<|im_end|>", "").strip()
+         output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+     raw_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
+
+     # Run CV underline detection
+     underline_mask = detect_underlines(image)
+     if np.sum(underline_mask) > 5000:
+         raw_text = f"<u>{raw_text}</u>"
+
+     return raw_text.strip()

- # ---------------- Export Helpers ----------------
- def _safe_text(text: str) -> str: return (text or "").strip()
- def save_as_pdf(text): ...
- def save_as_word(text): ...
- def save_as_audio(text): ...
- def calculate_cer_score(gt, pred): ...
-
- # ---------------- Gradio Interface ----------------
+ # ---------------- Gradio UI ----------------
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("## ✍🏾 Wilson OCR (Prompt underline mode)")
+     gr.Markdown("## ✍🏾 Wilson OCR (OpenCV underline mode)")
      model_choice = gr.Radio(choices=list(MODEL_PATHS.keys()), value=list(MODEL_PATHS.keys())[0], label="Select OCR Model")

      with gr.Tab("🖼 Image Inference"):
@@ -113,7 +72,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
          extract_btn = gr.Button("📤 Extract RAW Text", variant="primary")
          raw_output = gr.Textbox(label="📜 RAW Structured Output", lines=18, show_copy_button=True)

-     extract_btn.click(fn=ocr_image, inputs=[image_input, model_choice, query_input], outputs=[raw_output])
+     extract_btn.click(fn=ocr_with_underlines, inputs=[image_input, model_choice, query_input], outputs=[raw_output])

  if __name__ == "__main__":
      demo.queue().launch(share=True)
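Why the new detector works: detect_underlines() relies on morphological opening
with a deliberately wide, flat kernel. cv2.getStructuringElement(cv2.MORPH_RECT,
(30, 1)) erodes away any dark stroke narrower than roughly 30 px horizontally
(letters, vertical strokes), so only long horizontal runs such as underlines
survive the opening. A minimal, self-contained sketch of the same pipeline on a
synthetic image (the drawn test image and printed pixel counts are illustrative,
not part of the app):

    import cv2
    import numpy as np
    from PIL import Image, ImageDraw

    # Synthetic page: one long horizontal "underline" and one vertical stroke.
    img = Image.new("RGB", (400, 120), "white")
    draw = ImageDraw.Draw(img)
    draw.line((40, 80, 360, 80), fill="black", width=3)    # underline-like rule
    draw.line((200, 10, 200, 70), fill="black", width=3)   # letter-like vertical

    gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 1))  # wide and flat
    opened = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    print("foreground px before opening:", np.count_nonzero(thresh))
    print("foreground px after opening: ", np.count_nonzero(opened))

The vertical stroke is erased entirely while the 320-px rule survives, which is
the property the <u> heuristic depends on.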
 
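One behavioural difference from the removed _decode_text() helper is worth
flagging: for Qwen2.5-VL-style models, generate() returns the echoed prompt
tokens followed by the newly generated ones, and the old helper searched for the
prompt string and cut it off, while the new code decodes the full sequence
as-is. If the instruction text leaks into the transcription, a common fix is to
slice the prompt tokens off before decoding; a sketch under that assumption
(decode_new_tokens is a hypothetical helper, not part of this commit):

    import torch

    def decode_new_tokens(processor, model, inputs, max_new_tokens: int) -> str:
        # Keep only the tokens produced after the prompt, since generate()
        # prepends the echoed input ids to its output.
        with torch.inference_mode():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
        return processor.batch_decode(new_tokens, skip_special_tokens=True)[0]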
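The np.sum(underline_mask) > 5000 gate is coarser than it looks: the mask holds
0/255 values, so 5000 corresponds to only about 20 foreground pixels, and when
it fires the entire transcription is wrapped in a single <u>...</u> pair rather
than just the underlined words. The mask already localizes each underline, so
per-line tagging is possible later; a sketch of extracting bounding boxes
(underline_boxes is illustrative, not part of this commit):

    import cv2
    import numpy as np

    def underline_boxes(mask: np.ndarray) -> list[tuple[int, int, int, int]]:
        # One (x, y, w, h) box per detected underline segment, instead of a
        # single global yes/no decision.
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        return [cv2.boundingRect(c) for c in contours]

Each box's y position could then be matched against the OCR output's line order
to decide which transcribed line receives the <u> tags.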