import gradio as gr
from transformers import AutoModel, AutoImageProcessor
import cv2
import numpy as np
import torch
import os
from huggingface_hub import login

# Authenticate with the Hugging Face API token (needed to access the model)
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(HF_TOKEN)
else:
    raise ValueError("HF_TOKEN environment variable not set. Please add it in Space settings.")

# Load the DINOv3 model and its image processor
model = AutoModel.from_pretrained("facebook/dinov3-convnext-small-pretrain-lvd1689m")
image_processor = AutoImageProcessor.from_pretrained("facebook/dinov3-convnext-small-pretrain-lvd1689m")

def analyze_image(image, prompt):
    if image is None:
        return {"error": "Please upload an image."}

    # Convert the PIL image to OpenCV's BGR format (force 3-channel RGB first)
    image_np = np.array(image.convert("RGB"))
    image_cv = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

    # Extract features with DINOv3 (only the feature shape is reported below)
    inputs = image_processor(images=image_np, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
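    # A pooled feature vector could be taken here for similarity search or
    # classification (assumption: the backbone exposes `pooler_output` like
    # other transformers ConvNeXt models); this demo only reports the shape.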

    # Simple image analysis with OpenCV: binary threshold, then external
    # contours as rough object proposals
    gray = cv2.cvtColor(image_cv, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
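    # Note: the fixed threshold of 127 assumes reasonably even lighting. An
    # alternative (sketch) is Otsu's method, which picks the threshold
    # automatically:
    #   _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)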
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Analyze the image based on the prompt (English and German trigger phrases)
    description = []
    if "what do you see" in prompt.lower() or "was siehst du" in prompt.lower():
        if len(contours) == 0:
            description.append("Das Bild enthält keine klar erkennbaren Objekte.")
        else:
            for idx, contour in enumerate(contours[:10]):  # report at most 10 contours
                if cv2.contourArea(contour) < 100:  # skip small contours (likely noise)
                    continue
                x, y, w, h = cv2.boundingRect(contour)
                # Compute the mean color of the region (OpenCV uses BGR channel order)
                roi = image_cv[y:y+h, x:x+w]
                if roi.size == 0:
                    continue
                mean_color = np.mean(roi, axis=(0, 1)).astype(int)
                color_rgb = f"RGB({mean_color[2]},{mean_color[1]},{mean_color[0]})"
                description.append({
                    "object": f"Object_{idx}",
                    "color": color_rgb,
                    "position": f"x={x}, y={y}, width={w}, height={h}"
                })

    return {
        "prompt": prompt,
        "description": description if description else "No objects detected.",
        "features_shape": str(outputs.last_hidden_state.shape) if hasattr(outputs, 'last_hidden_state') else "No features extracted."
    }

# Build the Gradio interface
iface = gr.Interface(
    fn=analyze_image,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(label="Prompt", placeholder="Enter your prompt, e.g., 'What do you see?' or 'Was siehst du auf dem Bild?'")
    ],
    outputs="json",
    title="General Image Analysis with DINOv3",
    description="Upload an image and provide a prompt to get a description of what the model sees."
)

iface.launch()
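
# Example client call (sketch; assumes the app is running on the default local
# port and `gradio_client` is installed — the file name "example.jpg" is
# hypothetical):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(
#       handle_file("example.jpg"),     # image input
#       "What do you see?",             # prompt input
#       api_name="/predict",
#   )
#   print(result)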