Update app.py
app.py
CHANGED
@@ -3,196 +3,218 @@ from fastapi.responses import JSONResponse
 from PIL import Image
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 import torch
 from qwen_vl_utils import process_vision_info
-import fitz  # PyMuPDF
 import io
-import
-
-
-
-from contextlib import asynccontextmanager

-#
-model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct"
 model = None
 processor = None
 max_tokens = 2000

-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """Load model on startup"""
     global model, processor

-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"Using device: {device}")

-
-
-
-
-
-
-        )
-    else:
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_name,
-            torch_dtype="auto",
-            device_map="auto"
-        )

     processor = AutoProcessor.from_pretrained(model_name)
-    print("Model loaded successfully!")

-
-
-    # Cleanup on shutdown
-    print("Shutting down...")
-
-app = FastAPI(
-    title="PDF OCR API",
-    description="Convert PDF to images and extract text using Qari OCR model",
-    version="1.0.0",
-    lifespan=lifespan
-)

-
-
-
-
-
-
-
-
-        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
-        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        images.append(img)
-
-    pdf_document.close()
-    return images

-
-
-

-
-

-
-
-
-
-
-                {"type": "text", "text": prompt},
-            ],
-        }
-    ]

-
-
-
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to(model.device)

-
-
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]

-
-
-

-

-@app.post("/ocr"
-async def
     """
-

-

-    Returns
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-    try:
-        # Convert PDF to images
-        images = pdf_to_images(pdf_bytes)

-
-            raise HTTPException(status_code=400, detail="No pages found in PDF")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

-
-
-
-            "total_pages": len(images),
-            "file_size_mb": round(file_size / (1024*1024), 2),
-            "results": results
-        }

-
-        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

-@app.get("/")
-async def root():
-    """API status endpoint"""
-    return {
-        "message": "PDF OCR API is running",
-        "model": model_name,
-        "max_file_size": "5MB",
-        "endpoints": {
-            "POST /ocr": "Extract text from PDF"
-        }
-    }
-
-@app.get("/health")
-async def health_check():
-    """Health check endpoint"""
-    return {
-        "status": "healthy",
-        "model_loaded": model is not None
-    }

 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
 from PIL import Image
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 import torch
+import os
 from qwen_vl_utils import process_vision_info
 import io
+from typing import Optional
+import uuid
+
+app = FastAPI(title="Qari OCR API", description="OCR API using Qwen2VL model")

+# Global variables for model and processor
 model = None
 processor = None
 max_tokens = 2000

+@app.on_event("startup")
+async def load_model():
+    """Load model and processor on startup"""
     global model, processor

+    model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct"

+    print("Loading model...")
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        model_name,
+        torch_dtype="auto",
+        device_map="auto"
+    )

+    print("Loading processor...")
     processor = AutoProcessor.from_pretrained(model_name)

+    print("Model and processor loaded successfully!")
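
(Editor's note: recent FastAPI releases deprecate @app.on_event("startup") in favor of a lifespan context manager, which the previous version of this file used. A minimal sketch of the equivalent wiring, assuming the same load_model body; the names and wiring below are illustrative, not part of this commit:)

    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        await load_model()  # reuse the startup logic above
        yield               # the app serves requests while suspended here
        # optional shutdown cleanup would go after the yield

    app = FastAPI(title="Qari OCR API", lifespan=lifespan)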

+@app.get("/")
+async def root():
+    """Health check endpoint"""
+    return {
+        "status": "running",
+        "model": "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct",
+        "message": "OCR API is ready"
+    }

+@app.post("/ocr")
+async def perform_ocr(
+    file: UploadFile = File(...),
+    prompt: Optional[str] = None
+):
+    """
+    Perform OCR on uploaded image

+    Args:
+        file: Image file (PNG, JPG, JPEG)
+        prompt: Optional custom prompt (defaults to standard OCR prompt)

+    Returns:
+        JSON with extracted text
+    """
+    if model is None or processor is None:
+        raise HTTPException(status_code=503, detail="Model not loaded yet")

+    # Validate file type
+    if not file.content_type.startswith("image/"):
+        raise HTTPException(status_code=400, detail="File must be an image")

+    # Generate unique filename
+    temp_filename = f"temp_{uuid.uuid4()}.png"

+    try:
+        # Read and save image
+        contents = await file.read()
+        image = Image.open(io.BytesIO(contents))
+        image.save(temp_filename)
+
+        # Default prompt if not provided
+        if prompt is None:
+            prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
+
+        # Prepare messages
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": f"file://{temp_filename}"},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        # Process inputs
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+
+        # Generate output
+        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
+
+        return JSONResponse(content={
+            "success": True,
+            "text": output_text,
+            "filename": file.filename
+        })
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")

+    finally:
+        # Clean up temporary file
+        if os.path.exists(temp_filename):
+            os.remove(temp_filename)

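(Editor's note: for reference, a minimal client call against this endpoint; illustrative, not part of the commit. It assumes the server is running locally on port 7860, per the uvicorn.run call at the bottom, and that a page.png exists. Note that prompt arrives as a query parameter here, since it is declared as a plain Optional[str]:)

    import requests

    # Hypothetical local test of the /ocr endpoint above.
    with open("page.png", "rb") as f:
        resp = requests.post(
            "http://localhost:7860/ocr",
            params={"prompt": "Return the plain text of this page."},  # optional
            files={"file": ("page.png", f, "image/png")},  # content type must be image/*
        )
    print(resp.json())  # expected shape: {"success": true, "text": "...", "filename": "page.png"}

(If the installed qwen_vl_utils accepts PIL images directly in the "image" field, as recent versions document, the temp-file round trip could likely be dropped; the file:// path used here works but costs a disk write per request.)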
+@app.post("/ocr-batch")
+async def perform_ocr_batch(files: list[UploadFile] = File(...)):
     """
+    Perform OCR on multiple images

+    Args:
+        files: List of image files

+    Returns:
+        JSON with extracted text for each image
     """
+    if model is None or processor is None:
+        raise HTTPException(status_code=503, detail="Model not loaded yet")
+
+    results = []
+
+    for file in files:
+        if not file.content_type.startswith("image/"):
+            results.append({
+                "filename": file.filename,
+                "success": False,
+                "error": "File must be an image"
+            })
+            continue

+        temp_filename = f"temp_{uuid.uuid4()}.png"

+        try:
+            contents = await file.read()
+            image = Image.open(io.BytesIO(contents))
+            image.save(temp_filename)
+
+            prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": f"file://{temp_filename}"},
+                        {"type": "text", "text": prompt},
+                    ],
+                }
+            ]
+
+            text = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            image_inputs, video_inputs = process_vision_info(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+            inputs = inputs.to("cuda")
+
+            generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
+            generated_ids_trimmed = [
+                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            ]
+            output_text = processor.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False
+            )[0]
+
+            results.append({
+                "filename": file.filename,
+                "success": True,
+                "text": output_text
+            })
+
+        except Exception as e:
+            results.append({
+                "filename": file.filename,
+                "success": False,
+                "error": str(e)
+            })

+        finally:
+            if os.path.exists(temp_filename):
+                os.remove(temp_filename)

+    return JSONResponse(content={"results": results})
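(Editor's note: a matching client call for the batch endpoint, again illustrative and not part of the commit; each upload is sent under the repeated "files" field:)

    import requests

    # Hypothetical batch request against /ocr-batch.
    uploads = [
        ("files", ("a.png", open("a.png", "rb"), "image/png")),
        ("files", ("b.jpg", open("b.jpg", "rb"), "image/jpeg")),
    ]
    resp = requests.post("http://localhost:7860/ocr-batch", files=uploads)
    print(resp.json())  # {"results": [{"filename": "a.png", "success": true, "text": "..."}, ...]}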


 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)