Spaces:

Krokodilpirat
/

Video-Depth-Anything_RGBD_Zero

Running on Zero

App Files Files Community

Krokodilpirat committed on Jun 25, 2025

Commit

41131e3

verified ·

1 Parent(s): b77d16c

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -2

app.py CHANGED Viewed

@@ -45,8 +45,56 @@ print("Loading BLIP model...")
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
 def generate_blip_name(frame: np.ndarray) -> str:
-    """Generate filename from frame using BLIP image captioning"""
     try:
         # Check if frame is valid
         if frame is None or frame.size == 0:
@@ -57,11 +105,23 @@ def generate_blip_name(frame: np.ndarray) -> str:
         out = blip_model.generate(**inputs)
         caption = blip_processor.decode(out[0], skip_special_tokens=True).lower()
         # Remove common stopwords and create filename
         stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
         words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
         trimmed = "_".join(words[:3])
-        return trimmed[:30] if trimmed else "video"
     except Exception as e:
         print(f"BLIP error: {e}")
         return "video"

 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
+def get_middle_frame_for_blip(video_path, target_size=480):
+    """Effizient: Lädt nur das mittlere Frame für BLIP (nicht alle Frames!)"""
+    try:
+        cap = cv2.VideoCapture(video_path)
+        # Prüfe ob Video gültig ist
+        if not cap.isOpened():
+            print(f"DEBUG: Could not open video: {video_path}")
+            cap.release()
+            return None
+        # Hole Frame-Count und springe zum mittleren Frame
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0:
+            print(f"DEBUG: Invalid frame count: {frame_count}")
+            cap.release()
+            return None
+        middle_idx = frame_count // 2
+        print(f"DEBUG: Video has {frame_count} frames, jumping to frame {middle_idx}")
+        # Springe direkt zum Ziel-Frame (keine Iteration!)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, middle_idx)
+        ret, frame = cap.read()
+        cap.release()
+        if not ret or frame is None:
+            print("DEBUG: Could not read middle frame")
+            return None
+        # Verkleinere nur dieses eine Frame
+        h, w = frame.shape[:2]
+        if max(h, w) > target_size:
+            scale = target_size / max(h, w)
+            new_h, new_w = int(h * scale), int(w * scale)
+            frame = cv2.resize(frame, (new_w, new_h))
+            print(f"DEBUG: Resized frame from {w}x{h} to {new_w}x{new_h}")
+        else:
+            print(f"DEBUG: Frame size {w}x{h} already within target {target_size}")
+        # Convert BGR to RGB für BLIP
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        return frame_rgb
+    except Exception as e:
+        print(f"DEBUG: get_middle_frame_for_blip error: {e}")
+        return None
 def generate_blip_name(frame: np.ndarray) -> str:
+    """Generate filename from frame using BLIP image captioning + Duplikat-Entfernung"""
     try:
         # Check if frame is valid
         if frame is None or frame.size == 0:
         out = blip_model.generate(**inputs)
         caption = blip_processor.decode(out[0], skip_special_tokens=True).lower()
+        print(f"DEBUG: BLIP caption: '{caption}'")
         # Remove common stopwords and create filename
         stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
         words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
+        # 🎯 NEUE OPTIMIERUNG: Entferne Duplikate, behalte Reihenfolge
+        words = list(dict.fromkeys(words))
+        print(f"DEBUG: Words after stopword removal and deduplication: {words}")
         trimmed = "_".join(words[:3])
+        result = trimmed[:30] if trimmed else "video"
+        print(f"DEBUG: Final BLIP name: '{result}'")
+        return result
     except Exception as e:
         print(f"BLIP error: {e}")
         return "video"