Spaces:

Emeritus-21
/

handwritten-text-recognition

Runtime error

App Files Files Community

Emeritus-21 committed on Aug 22

Commit

927e645

verified ·

1 Parent(s): 4ef4dae

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -54

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
-# app.py — HTR Space (Refined Compact Version)
 import os, time
 from threading import Thread
 import gradio as gr
 from PIL import Image
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText, Qwen2_5_VLForConditionalGeneration
@@ -10,30 +10,47 @@ from reportlab.platypus import SimpleDocTemplate, Paragraph
 from reportlab.lib.styles import getSampleStyleSheet
 from docx import Document
-# ---------------- Constants ----------------
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 512
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # ---------------- Models ----------------
 MODEL_PATHS = {
-    "Complex Handwriting": ("prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it", Qwen2_5_VLForConditionalGeneration),
-    "Simple/Scanned Handwriting": ("nanonets/Nanonets-OCR-s", Qwen2_5_VLForConditionalGeneration),
-    "Structured Handwriting": ("Emeritus-21/Finetuned-full-HTR-model", AutoModelForImageTextToText),
 }
 _loaded_processors, _loaded_models = {}, {}
-print("🚀 Loading HTR models...")
 for name, (repo_id, cls) in MODEL_PATHS.items():
     try:
         processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
-        model = cls.from_pretrained(repo_id, trust_remote_code=True,
-                                   torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                                   low_cpu_mem_usage=True).to(device).eval()
         _loaded_processors[name], _loaded_models[name] = processor, model
-        print(f"✅ {name} ready")
     except Exception as e:
-        print(f"⚠️ Failed {name}: {e}")
 # ---------------- Helpers ----------------
 def _build_inputs(processor, tokenizer, image: Image.Image, prompt: str):
@@ -46,31 +63,36 @@ def _build_inputs(processor, tokenizer, image: Image.Image, prompt: str):
 def _decode_text(model, processor, tokenizer, output_ids):
     for obj in [processor, tokenizer, getattr(model, "tokenizer", None)]:
         try: return obj.batch_decode(output_ids, skip_special_tokens=True)[0]
-        except: pass
     return str(output_ids)
 def _default_prompt(query: str | None) -> str:
     if query and query.strip(): return query.strip()
-    return ("You are a professional Handwritten OCR system.\n"
-            "TASK: Read the handwritten image and transcribe the text EXACTLY as written.\n"
-            "- Preserve original structure and line breaks.\n"
-            "- Keep spacing, bullet points, numbering, and indentation.\n"
-            "- Render tables as Markdown tables if present.\n"
-            "- Do NOT autocorrect spelling or grammar.\n"
-            "- Do NOT merge lines.\n"
-            "Return RAW transcription only.")
-# ---------------- OCR ----------------
-def ocr_image(model_name: str, image: Image.Image, query: str = "",
-              max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
-    if image is None: return "Please upload an image."
-    if model_name not in _loaded_models: return "Invalid model selected."
-    processor, model = _loaded_processors[model_name], _loaded_models[model_name]
-    tokenizer = getattr(processor, "tokenizer", None)
     prompt = _default_prompt(query)
     batch = _build_inputs(processor, tokenizer, image, prompt).to(device)
     with torch.inference_mode():
-        output_ids = model.generate(**batch, max_new_tokens=max_new_tokens)
     return _decode_text(model, processor, tokenizer, output_ids).replace("<|im_end|>", "").strip()
 # ---------------- Export Helpers ----------------
@@ -93,28 +115,44 @@ def save_as_word(text):
     doc.save("output.docx")
     return "output.docx"
 # ---------------- Gradio Interface ----------------
-css = """.submit-btn { background-color: #2980b9 !important; color: white !important; }
-.submit-btn:hover { background-color: #3498db !important; }
-.canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px;}"""
-with gr.Blocks(css=css, theme="soft") as demo:
-    gr.Markdown("## ✍🏾 Wilson HTR OCR")
-    with gr.Row():
-        with gr.Column():
-            model_choice = gr.Radio(choices=list(MODEL_PATHS.keys()), value=list(MODEL_PATHS.keys())[0], label="Select OCR Model")
-            query_input = gr.Textbox(label="Custom Prompt (optional)")
-            image_input = gr.Image(type="pil", label="Upload / Capture Image", source="upload")
-            submit_btn = gr.Button("📤 Extract Text", elem_classes="submit-btn")
-            raw_output = gr.Textbox(label="OCR Output", lines=15, interactive=False, show_copy_button=True)
-            pdf_btn = gr.Button("⬇️ Download PDF")
-            word_btn = gr.Button("⬇️ Download Word")
-            pdf_file = gr.File(label="PDF File")
-            word_file = gr.File(label="Word File")
-            submit_btn.click(fn=ocr_image, inputs=[model_choice, image_input, query_input], outputs=[raw_output])
-            pdf_btn.click(fn=save_as_pdf, inputs=[raw_output], outputs=[pdf_file])
-            word_btn.click(fn=save_as_word, inputs=[raw_output], outputs=[word_file])
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, show_error=True)

+# app.py — HTR Space (Compact Version)
 import os, time
 from threading import Thread
 import gradio as gr
+import spaces
 from PIL import Image
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText, Qwen2_5_VLForConditionalGeneration
 from reportlab.lib.styles import getSampleStyleSheet
 from docx import Document
 # ---------------- Models ----------------
+# Maps the label offered in the model selector to a pair of
+# (Hugging Face repo id, model class whose from_pretrained() loads it).
 MODEL_PATHS = {
+    "Model 1 (Complex handwrittings )": ("prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it", Qwen2_5_VLForConditionalGeneration),
+    "Model 2 (simple and scanned handwritting )": ("nanonets/Nanonets-OCR-s", Qwen2_5_VLForConditionalGeneration),
+    "Model 3 (structured handwritting)": ("Emeritus-21/Finetuned-full-HTR-model", AutoModelForImageTextToText),
 }
+# Default generation length; used as ocr_image's max_new_tokens default and as
+# the initial value of the "Max new tokens" slider.
+MAX_NEW_TOKENS_DEFAULT = 512
+# Models and input batches are moved to CUDA when it is available, else CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Caches keyed by the MODEL_PATHS label. Only models that loaded successfully
+# end up in here.
 _loaded_processors, _loaded_models = {}, {}
+print("🚀 Preloading models into GPU/CPU memory...")
+# Every configured model is loaded eagerly at import time. A failure is only
+# printed; that label is then missing from both caches, so ocr_image answers
+# "Invalid model: ..." for it instead of crashing.
 for name, (repo_id, cls) in MODEL_PATHS.items():
     try:
+        # NOTE(review): trust_remote_code=True allows code shipped in the model
+        # repo to run while loading — confirm every repo above is trusted.
         processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
+        model = cls.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            # Half precision when CUDA is available, full precision otherwise.
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            low_cpu_mem_usage=True
+        ).to(device).eval()
+        # Both entries are stored together, so a processor is never cached
+        # without its model.
         _loaded_processors[name], _loaded_models[name] = processor, model
+        print(f"✅ {name} ready.")
     except Exception as e:
+        print(f"⚠️ Failed to load {name}: {e}")
+# ---------------- GPU Warmup ----------------
+@spaces.GPU
+def warmup(progress=gr.Progress(track_tqdm=True)):
+    """Run a one-token, text-only generation on the first configured model.
+
+    Returns a status string: a "ready" message on success, or
+    "Warmup skipped: <error>" when any step of the attempt raises (including
+    the first model being absent from the caches).
+    """
+    try:
+        first_label = next(iter(MODEL_PATHS.keys()))
+        warm_processor = _loaded_processors[first_label]
+        warm_model = _loaded_models[first_label]
+        warm_tokenizer = getattr(warm_processor, "tokenizer", None)
+        # Use the chat template when the tokenizer offers one; otherwise fall
+        # back to the bare warmup string.
+        if warm_tokenizer and hasattr(warm_tokenizer, "apply_chat_template"):
+            conversation = [{"role": "user", "content": [{"type": "text", "text": "Warmup."}]}]
+            text_prompt = warm_tokenizer.apply_chat_template(
+                conversation, tokenize=False, add_generation_prompt=True
+            )
+        else:
+            text_prompt = "Warmup."
+        encoded = warm_processor(text=[text_prompt], images=None, return_tensors="pt").to(device)
+        with torch.inference_mode():
+            # The generated token is discarded; only the forward pass matters.
+            warm_model.generate(**encoded, max_new_tokens=1)
+        return f"GPU warm and {first_label} ready."
+    except Exception as exc:
+        return f"Warmup skipped: {exc}"
 # ---------------- Helpers ----------------
 def _build_inputs(processor, tokenizer, image: Image.Image, prompt: str):
 def _decode_text(model, processor, tokenizer, output_ids):
+    """Decode the first sequence of `output_ids` with the first decoder that works.
+
+    Tries, in order, the processor, the tokenizer, and `model.tokenizer` (if
+    the model has one). A candidate that is None or whose `batch_decode`
+    raises is skipped. Falls back to `str(output_ids)` when none succeeds.
+    """
+    candidates = (processor, tokenizer, getattr(model, "tokenizer", None))
+    for decoder in candidates:
+        try:
+            decoded = decoder.batch_decode(output_ids, skip_special_tokens=True)
+            return decoded[0]
+        except Exception:
+            # Best effort: move on to the next candidate.
+            continue
     return str(output_ids)
 def _default_prompt(query: str | None) -> str:
+    """Return the caller's prompt stripped of surrounding whitespace, or the
+    built-in transcription prompt when `query` is None, empty or blank."""
+    custom = (query or "").strip()
+    if custom:
+        return custom
+    instructions = [
+        "You are a professional Handwritten OCR system.",
+        "TASK: Read the handwritten image and transcribe the text EXACTLY as written.",
+        "- Preserve original structure and line breaks.",
+        "- Keep spacing, bullet points, numbering, and indentation.",
+        "- Render tables as Markdown tables if present.",
+        "- Do NOT autocorrect spelling or grammar.",
+        "- Do NOT merge lines.",
+        "Return RAW transcription only.",
+    ]
+    return "\n".join(instructions)
+# ---------------- OCR Function ----------------
+@spaces.GPU
+def ocr_image(image: Image.Image, model_choice: str, query: str | None = None,
+              max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT,
+              temperature: float = 0.1, top_p: float = 1.0, top_k: int = 0, repetition_penalty: float = 1.0,
+              progress=gr.Progress(track_tqdm=True)):
+    """Transcribe a handwritten image with the selected model.
+
+    Args:
+        image: Image to transcribe; None yields a user-facing hint.
+        model_choice: A MODEL_PATHS label; labels whose model failed to load
+            yield an "Invalid model" message.
+        query: Optional custom prompt; blank/None selects the default prompt.
+        max_new_tokens: Upper bound on generated tokens.
+        temperature, top_p, top_k: Accepted so the UI/API wiring stays stable,
+            but NOT forwarded: decoding is greedy (do_sample=False), where
+            these sampling knobs have no effect and only trigger warnings.
+            NOTE(review): decide whether sampling should be enabled instead.
+        repetition_penalty: Forwarded to generate().
+        progress: Not read in the body; kept in the signature.
+
+    Returns:
+        The transcription string, or a message describing invalid input.
+    """
+    if image is None:
+        return "Please upload or capture an image."
+    if model_choice not in _loaded_models:
+        return f"Invalid model: {model_choice}"
+    processor = _loaded_processors[model_choice]
+    model = _loaded_models[model_choice]
+    tokenizer = getattr(processor, "tokenizer", None)
     prompt = _default_prompt(query)
     batch = _build_inputs(processor, tokenizer, image, prompt).to(device)
     with torch.inference_mode():
+        output_ids = model.generate(
+            **batch,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+            repetition_penalty=repetition_penalty,
+        )
+    # Decoder-only models return prompt + continuation from generate(). Drop
+    # the prompt tokens so the instructions are not echoed back as part of
+    # the transcription. Encoder-decoder outputs do not contain the prompt.
+    is_encoder_decoder = getattr(getattr(model, "config", None), "is_encoder_decoder", False)
+    if not is_encoder_decoder and "input_ids" in batch.keys():
+        prompt_len = batch["input_ids"].shape[-1]
+        if output_ids.shape[-1] > prompt_len:
+            output_ids = output_ids[:, prompt_len:]
     return _decode_text(model, processor, tokenizer, output_ids).replace("<|im_end|>", "").strip()
 # ---------------- Export Helpers ----------------
     doc.save("output.docx")
     return "output.docx"
+def save_as_audio(text):
+    """Synthesize `text` to speech and return the path of the MP3 file.
+
+    The text is first passed through `_safe_text` (defined outside this view).
+    Returns "output.mp3" on success, or None when the cleaned text is empty or
+    when speech synthesis fails; the failure is printed, not raised.
+    """
+    text = _safe_text(text)
+    if not text:
+        return None
+    try:
+        # The gTTS distribution installs the importable package `gtts`
+        # (lowercase). `from gTTS import gTTS` raised ImportError, so every
+        # call used to fall into the except branch and return None.
+        from gtts import gTTS
+        tts = gTTS(text)
+        tts.save("output.mp3")
+        return "output.mp3"
+    except Exception as e:
+        print(f"gTTS failed: {e}")
+        return None
 # ---------------- Gradio Interface ----------------
+# Build the UI: a model selector, one inference tab holding the prompt and
+# image inputs, advanced generation sliders, the text output, and export buttons.
+# NOTE(review): warmup() is defined in this file but nothing in this view
+# calls it or wires it to an event — confirm whether that is intended.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## ✍🏾 wilson Handwritten OCR")
+    # The MODEL_PATHS labels are the selectable values; the first one is preselected.
+    model_choice = gr.Radio(choices=list(MODEL_PATHS.keys()), value=list(MODEL_PATHS.keys())[0], label="Select OCR Model")
+    with gr.Tab("🖼 Image Inference"):
+        query_input = gr.Textbox(label="Custom Prompt (optional)", placeholder="Leave empty for RAW structured output")
+        image_input = gr.Image(type="pil", label="Upload / Capture Handwritten Image", sources=["upload", "webcam"])
+        # NOTE(review): ocr_image calls generate() with do_sample=False, so the
+        # temperature / top-p / top-k sliders presumably do not influence the
+        # output — confirm and either enable sampling or drop the sliders.
+        with gr.Accordion("⚙️ Advanced Options", open=False):
+            max_new_tokens = gr.Slider(1, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=1, label="Max new tokens")
+            temperature = gr.Slider(0.1, 2.0, value=0.1, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.05, 1.0, value=1.0, step=0.05, label="Top-p (nucleus)")
+            top_k = gr.Slider(0, 1000, value=0, step=1, label="Top-k")
+            repetition_penalty = gr.Slider(0.8, 2.0, value=1.0, step=0.05, label="Repetition penalty")
+        extract_btn = gr.Button("📤 Extract RAW Text", variant="primary")
+        clear_btn = gr.Button("🧹 Clear")
+        raw_output = gr.Textbox(label="📜 RAW Structured Output (exact as written)", lines=18, show_copy_button=True)
+        pdf_btn = gr.Button("⬇️ Download as PDF")
+        word_btn = gr.Button("⬇️ Download as Word")
+        audio_btn = gr.Button("🔊 Download as Audio")
+        pdf_file, word_file, audio_file = gr.File(label="PDF File"), gr.File(label="Word File"), gr.File(label="Audio File")
+        # The inputs list must stay in the order of ocr_image's positional
+        # parameters: image, model_choice, query, max_new_tokens, temperature,
+        # top_p, top_k, repetition_penalty.
+        extract_btn.click(fn=ocr_image, inputs=[image_input, model_choice, query_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[raw_output], api_name="ocr_image")
+        # Each export button converts the current text output into a file.
+        pdf_btn.click(fn=save_as_pdf, inputs=[raw_output], outputs=[pdf_file])
+        word_btn.click(fn=save_as_word, inputs=[raw_output], outputs=[word_file])
+        audio_btn.click(fn=save_as_audio, inputs=[raw_output], outputs=[audio_file])
+        # Reset the output, image, prompt and every advanced slider to the same
+        # values the components were created with; tuple order matches `outputs`.
+        clear_btn.click(fn=lambda: ("", None, "", MAX_NEW_TOKENS_DEFAULT, 0.1, 1.0, 0, 1.0), outputs=[raw_output, image_input, query_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty])
 if __name__ == "__main__":
+    # Enable the request queue (max_size=50) and launch with show_error=True.
+    demo.queue(max_size=50).launch(show_error=True)