File size: 3,944 Bytes
81fc526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from pathlib import Path
import gradio as gr
import pymupdf
from ultralytics import YOLO
from PIL import Image
from huggingface_hub import hf_hub_download

# Directory holding the bundled example images, resolved relative to this file.
SAMPLES = Path(__file__).parent / "samples"

# Example inputs shown in the Gradio Examples widget.
IMAGE_SAMPLES = [SAMPLES / f"image{idx}.png" for idx in range(1, 5)]

# Model-size key -> (Hugging Face repo id, checkpoint filename).
AVAILABLE_MODELS = {
    size: ("Armaggheddon/yolo11-document-layout", f"{size}_doc_layout.pt")
    for size in ("yolo11n", "yolo11s", "yolo11m")
}

# Lazy model cache: `model` is loaded on first use by load_model(), and
# `current_model` tracks which size is currently loaded.
current_model = "yolo11n"
model = None

def load_model(selected_model):
    """Ensure the requested YOLO model is downloaded and loaded.

    Uses the module-level ``model`` / ``current_model`` pair as a cache so
    repeated calls with the same selection reuse the already-loaded model.

    Args:
        selected_model: Key into ``AVAILABLE_MODELS`` (e.g. ``"yolo11n"``).
    """
    global model, current_model
    if model is None or current_model != selected_model:
        repo_id, filename = AVAILABLE_MODELS[selected_model]
        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
        model = YOLO(model_path)
        # BUG FIX: record what we just loaded; previously current_model was
        # never updated, so the cache check failed on every later call and
        # the model was reloaded each time after a size switch.
        current_model = selected_model

def model_runner(image, conf=0.25, iou=0.45):
    """Run layout detection on one image and return the annotated result.

    Args:
        image: Input accepted by Ultralytics ``predict`` (path or PIL image).
        conf: Confidence threshold for detections.
        iou: IOU threshold for non-max suppression.

    Returns:
        The first prediction rendered as an annotated image array.
    """
    predictions = model.predict(
        source=image,
        save=False,
        verbose=False,
        conf=conf,
        iou=iou,
        imgsz=1280,
    )
    return predictions[0].plot()

def process_input(selected_model, pdf_input, image_input, conf=0.25, iou=0.45):
    """Run layout detection on an uploaded PDF (page by page) or a single image.

    Args:
        selected_model: Key into ``AVAILABLE_MODELS``.
        pdf_input: Filepath of an uploaded PDF, or None.
        image_input: Filepath of an uploaded image, or None.
        conf: Confidence threshold forwarded to the model.
        iou: IOU threshold forwarded to the model.

    Returns:
        A list of ``(annotated_image, caption)`` tuples for the Gallery.

    Raises:
        gr.Error: If no file was uploaded or the file type is unsupported.
    """
    if pdf_input is None and image_input is None:
        # BUG FIX: gr.Error must be *raised*, not returned — a returned
        # exception object would be handed to the Gallery as output.
        raise gr.Error("Please upload a PDF or an image file.")

    load_model(selected_model)
    pages = []
    if pdf_input is not None and pdf_input.endswith(".pdf"):
        doc = pymupdf.open(pdf_input)
        for page in doc:
            pix = page.get_pixmap(dpi=200)  # if A4 should result in above 1400px width
            pil_img = pix.pil_image()
            # BUG FIX: forward the slider values; previously conf/iou were
            # accepted but silently ignored (model_runner defaults were used).
            pages.append(model_runner(pil_img, conf=conf, iou=iou))
    elif image_input is not None and image_input.endswith((".png", ".jpg", ".jpeg")):
        pages.append(model_runner(image_input, conf=conf, iou=iou))
    else:
        raise gr.Error(
            "Unsupported file type. Please upload a PDF or an image file "
            "with .pdf, .png, .jpg or .jpeg extension."
        )

    # BUG FIX: return a concrete list — the Gallery expects a list of
    # (image, caption) pairs, not a generator expression.
    return [(page, f"Page {i+1}") for i, page in enumerate(pages)]

# Build and launch the Gradio UI: file/image inputs on the left, the
# annotated gallery plus model/threshold controls on the right.
with gr.Blocks() as demo:
    gr.Markdown("# YOLO11 Document Layout 🔎📄")
    gr.Markdown(
"""
Detects layout elements in documents (PDFs or images) using YOLOv11 models and the Ultralytics library.
Upload a PDF or an image, select a model size, and click "Run" to see the detected layout elements.
- Finetuned models available at [Armaggheddon/yolo11-document-layout](https://huggingface.co/Armaggheddon/yolo11-document-layout)
- More available in the [GitHub Repository](https://github.com/Armaggheddon/yolo11_doc_layout)
"""
    )
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], file_count="single")
            image_input = gr.Image(label="Upload Image", type="filepath")
            clear_button = gr.Button("Clear")
            run_button = gr.Button("Run", variant="primary")
        with gr.Column():
            outputs = gr.Gallery(label="Output Image")
            with gr.Group():
                model_name = gr.Dropdown(
                    list(AVAILABLE_MODELS.keys()),
                    value="yolo11n",
                    label="Model size",
                )
                conf = gr.Slider(0, 1, value=0.25, step=0.01, label="Confidence threshold")
                iou = gr.Slider(0, 1, value=0.45, step=0.01, label="IOU threshold")

    # BUG FIX: the example rows must line up with process_input's parameter
    # order (selected_model, pdf_input, image_input). Previously the image
    # path was bound to `image_input`-first inputs while fn expected the
    # model name first, so running an example passed the image path as the
    # model key and "yolo11n" as the PDF path.
    examples = gr.Examples(
        examples=[["yolo11n", None, str(p)] for p in IMAGE_SAMPLES],
        inputs=[model_name, pdf_input, image_input],
        cache_examples=False,
        fn=process_input,
        outputs=outputs,
    )

    run_button.click(
        fn=process_input,
        inputs=[model_name, pdf_input, image_input, conf, iou],
        outputs=outputs,
    )

    # Reset both inputs and the gallery in one shot.
    clear_button.click(
        fn=lambda: (None, None, None),
        inputs=[],
        outputs=[pdf_input, image_input, outputs],
    )

demo.launch()