File size: 3,944 Bytes
81fc526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from pathlib import Path
import gradio as gr
import pymupdf
from ultralytics import YOLO
from PIL import Image
from huggingface_hub import hf_hub_download

# Directory holding the bundled example images, resolved relative to this file.
SAMPLES = Path(__file__).parent / "samples"

# Example inputs shown in the Gradio Examples widget.
IMAGE_SAMPLES = [SAMPLES / f"image{idx}.png" for idx in range(1, 5)]

# Model-size key -> (Hugging Face repo id, checkpoint filename).
AVAILABLE_MODELS = {
    size: ("Armaggheddon/yolo11-document-layout", f"{size}_doc_layout.pt")
    for size in ("yolo11n", "yolo11s", "yolo11m")
}

# Lazy model cache: `model` is loaded on first use by load_model(), and
# `current_model` tracks which size is currently loaded.
current_model = "yolo11n"
model = None

def load_model(selected_model):
    """Ensure the requested YOLO model is downloaded and loaded.

    Uses the module-level ``model`` / ``current_model`` pair as a cache so
    repeated calls with the same selection reuse the already-loaded model.

    Args:
        selected_model: Key into ``AVAILABLE_MODELS`` (e.g. ``"yolo11n"``).
    """
    global model, current_model
    if model is None or current_model != selected_model:
        repo_id, filename = AVAILABLE_MODELS[selected_model]
        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
        model = YOLO(model_path)
        # BUG FIX: record what we just loaded; previously current_model was
        # never updated, so the cache check failed on every later call and
        # the model was reloaded each time after a size switch.
        current_model = selected_model

def model_runner(image, conf=0.25, iou=0.45):
    """Run layout detection on one image and return the annotated result.

    Args:
        image: Input accepted by Ultralytics ``predict`` (path or PIL image).
        conf: Confidence threshold for detections.
        iou: IOU threshold for non-max suppression.

    Returns:
        The first prediction rendered as an annotated image array.
    """
    predictions = model.predict(
        source=image,
        save=False,
        verbose=False,
        conf=conf,
        iou=iou,
        imgsz=1280,
    )
    return predictions[0].plot()

def process_input(selected_model, pdf_input, image_input, conf=0.25, iou=0.45):
    """Run layout detection on an uploaded PDF (page by page) or a single image.

    Args:
        selected_model: Key into ``AVAILABLE_MODELS``.
        pdf_input: Filepath of an uploaded PDF, or None.
        image_input: Filepath of an uploaded image, or None.
        conf: Confidence threshold forwarded to the model.
        iou: IOU threshold forwarded to the model.

    Returns:
        A list of ``(annotated_image, caption)`` tuples for the Gallery.

    Raises:
        gr.Error: If no file was uploaded or the file type is unsupported.
    """
    if pdf_input is None and image_input is None:
        # BUG FIX: gr.Error must be *raised*, not returned — a returned
        # exception object would be handed to the Gallery as output.
        raise gr.Error("Please upload a PDF or an image file.")

    load_model(selected_model)
    pages = []
    if pdf_input is not None and pdf_input.endswith(".pdf"):
        doc = pymupdf.open(pdf_input)
        for page in doc:
            pix = page.get_pixmap(dpi=200)  # if A4 should result in above 1400px width
            pil_img = pix.pil_image()
            # BUG FIX: forward the slider values; previously conf/iou were
            # accepted but silently ignored (model_runner defaults were used).
            pages.append(model_runner(pil_img, conf=conf, iou=iou))
    elif image_input is not None and image_input.endswith((".png", ".jpg", ".jpeg")):
        pages.append(model_runner(image_input, conf=conf, iou=iou))
    else:
        raise gr.Error(
            "Unsupported file type. Please upload a PDF or an image file "
            "with .pdf, .png, .jpg or .jpeg extension."
        )

    # BUG FIX: return a concrete list — the Gallery expects a list of
    # (image, caption) pairs, not a generator expression.
    return [(page, f"Page {i+1}") for i, page in enumerate(pages)]

# Build and launch the Gradio UI: file/image inputs on the left, the
# annotated gallery plus model/threshold controls on the right.
with gr.Blocks() as demo:
    gr.Markdown("# YOLO11 Document Layout 🔎📄")
    gr.Markdown(
"""
Detects layout elements in documents (PDFs or images) using YOLOv11 models and the Ultralytics library.
Upload a PDF or an image, select a model size, and click "Run" to see the detected layout elements.
- Finetuned models available at [Armaggheddon/yolo11-document-layout](https://huggingface.co/Armaggheddon/yolo11-document-layout)
- More available in the [GitHub Repository](https://github.com/Armaggheddon/yolo11_doc_layout)
"""
    )
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], file_count="single")
            image_input = gr.Image(label="Upload Image", type="filepath")
            clear_button = gr.Button("Clear")
            run_button = gr.Button("Run", variant="primary")
        with gr.Column():
            outputs = gr.Gallery(label="Output Image")
            with gr.Group():
                model_name = gr.Dropdown(
                    list(AVAILABLE_MODELS.keys()),
                    value="yolo11n",
                    label="Model size",
                )
                conf = gr.Slider(0, 1, value=0.25, step=0.01, label="Confidence threshold")
                iou = gr.Slider(0, 1, value=0.45, step=0.01, label="IOU threshold")

    # BUG FIX: the example rows must line up with process_input's parameter
    # order (selected_model, pdf_input, image_input). Previously the image
    # path was bound to `image_input`-first inputs while fn expected the
    # model name first, so running an example passed the image path as the
    # model key and "yolo11n" as the PDF path.
    examples = gr.Examples(
        examples=[["yolo11n", None, str(p)] for p in IMAGE_SAMPLES],
        inputs=[model_name, pdf_input, image_input],
        cache_examples=False,
        fn=process_input,
        outputs=outputs,
    )

    run_button.click(
        fn=process_input,
        inputs=[model_name, pdf_input, image_input, conf, iou],
        outputs=outputs,
    )

    # Reset both inputs and the gallery in one shot.
    clear_button.click(
        fn=lambda: (None, None, None),
        inputs=[],
        outputs=[pdf_input, image_input, outputs],
    )

demo.launch()