Spaces:

Krokodilpirat
/

Video-Depth-Anything_RGBD_Zero

Running on Zero

App Files Files Community

Krokodilpirat committed on Jun 25

Commit

b77d16c

verified ·

1 Parent(s): b0290d7

Update app.py

Browse files

Files changed (1) hide show

app.py +273 -232

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ import numpy as np
 import gradio as gr
 import subprocess
 import requests
-import time
 from urllib.parse import urlparse
 from huggingface_hub import hf_hub_download
 from video_depth_anything.video_depth import VideoDepthAnything
@@ -46,84 +45,6 @@ print("Loading BLIP model...")
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
-# --- Load depth model ---
-print("Loading Video Depth Anything model...")
-DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-encoder = 'vitl'
-model_name = 'Large'
-model_configs = {
-    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
-}
-video_depth_anything = VideoDepthAnything(**model_configs[encoder])
-ckpt_path = hf_hub_download(repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
-                            filename=f"video_depth_anything_{encoder}.pth",
-                            cache_dir="/tmp/huggingface")
-video_depth_anything.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
-video_depth_anything = video_depth_anything.to(DEVICE).eval()
-# --- Global variables for toggling ---
-current_video_file = None
-current_video_url = None
-blip_generated_name = ""
-original_filename = ""
-# --- Optimized BLIP processing ---
-def get_middle_frame_for_blip(video_path, target_size=480):
-    """Efficiently extract only the middle frame for BLIP processing"""
-    try:
-        cap = cv2.VideoCapture(video_path)
-        if not cap.isOpened():
-            raise ValueError(f"Could not open video: {video_path}")
-        # Get total frame count
-        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        if frame_count <= 0:
-            raise ValueError("Video has no frames")
-        # Calculate middle frame index
-        middle_frame_idx = frame_count // 2
-        print(f"DEBUG: Video has {frame_count} frames, extracting frame {middle_frame_idx} for BLIP")
-        # Jump directly to middle frame (no loading of other frames!)
-        cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_idx)
-        ret, frame = cap.read()
-        if not ret or frame is None:
-            raise ValueError("Could not read middle frame")
-        # Get original dimensions
-        original_height, original_width = frame.shape[:2]
-        print(f"DEBUG: Original frame size: {original_width} x {original_height}")
-        # Calculate new dimensions maintaining aspect ratio
-        if original_width > original_height:
-            new_width = target_size
-            new_height = int((original_height * target_size) / original_width)
-        else:
-            new_height = target_size
-            new_width = int((original_width * target_size) / original_height)
-        # Ensure even dimensions for compatibility
-        new_width = new_width if new_width % 2 == 0 else new_width + 1
-        new_height = new_height if new_height % 2 == 0 else new_height + 1
-        print(f"DEBUG: BLIP frame resized to: {new_width} x {new_height}")
-        # Resize only this one frame
-        frame_resized = cv2.resize(frame, (new_width, new_height))
-        # Convert BGR to RGB for BLIP
-        frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
-        cap.release()
-        return frame_rgb
-    except Exception as e:
-        print(f"ERROR: Failed to extract middle frame: {e}")
-        if 'cap' in locals():
-            cap.release()
-        raise
 def generate_blip_name(frame: np.ndarray) -> str:
     """Generate filename from frame using BLIP image captioning"""
     try:
@@ -139,17 +60,28 @@ def generate_blip_name(frame: np.ndarray) -> str:
         # Remove common stopwords and create filename
         stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
         words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
-        # Remove duplicates while preserving order
-        words = list(dict.fromkeys(words))
         trimmed = "_".join(words[:3])
         return trimmed[:30] if trimmed else "video"
     except Exception as e:
         print(f"BLIP error: {e}")
         return "video"
-# --- URL validation and video source detection ---
 def validate_url(url):
     """Validate if URL is properly formatted"""
     try:
@@ -169,7 +101,6 @@ def detect_video_source(url):
     else:
         return "unknown"
-# --- Video download functions ---
 def optimize_civitai_url(url):
     """Convert gallery Civitai URLs to original quality to avoid dimension issues"""
     if "image.civitai.com" in url and "width=450" in url:
@@ -187,12 +118,14 @@ def download_civitai_video(civitai_url):
         # Optimize URL to avoid dimension issues
         civitai_url = optimize_civitai_url(civitai_url)
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Referer': 'https://civitai.com/',
             'Accept': 'video/webm,video/mp4,video/*;q=0.9,*/*;q=0.8',
         }
         print(f"DEBUG: Downloading optimized Civitai video: {civitai_url}")
         response = requests.get(civitai_url, headers=headers, stream=True, timeout=30)
@@ -201,8 +134,10 @@ def download_civitai_video(civitai_url):
         # Create filename based on URL
         try:
             parsed_url = urlparse(civitai_url)
             path_parts = parsed_url.path.split('/')
             if len(path_parts) > 1:
                 filename_part = path_parts[-1]
                 if '.' in filename_part:
                     temp_path = f"temp_civitai_{filename_part}"
@@ -211,6 +146,7 @@ def download_civitai_video(civitai_url):
             else:
                 temp_path = f"temp_civitai_{int(time.time())}.webm"
         except:
             temp_path = f"temp_civitai_{int(time.time())}.webm"
         # Download the file
@@ -225,43 +161,25 @@ def download_civitai_video(civitai_url):
     except Exception as e:
         raise RuntimeError(f"Failed to download Civitai video: {e}")
-def download_kling_video(kling_url):
-    """Direct download for Kling videos (no proxy needed)"""
     try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-            'Referer': 'https://kling.ai/',
-            'Accept': 'video/mp4,video/*;q=0.9,*/*;q=0.8',
-        }
-        print(f"DEBUG: Downloading Kling video: {kling_url}")
-        response = requests.get(kling_url, headers=headers, stream=True, timeout=30)
-        response.raise_for_status()
-        # Create filename - extract video ID from URL
-        try:
-            import re
-            match = re.search(r'/([a-f0-9-]{36})_', kling_url)
-            if match:
-                video_id = match.group(1)[:12]
-                temp_path = f"temp_kling_{video_id}.mp4"
-            else:
-                temp_path = f"temp_kling_{int(time.time())}.mp4"
-        except:
-            temp_path = f"temp_kling_{int(time.time())}.mp4"
-        # Download the file
-        with open(temp_path, "wb") as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                if chunk:
-                    f.write(chunk)
-        print(f"DEBUG: Kling video downloaded to: {temp_path}")
-        return temp_path
     except Exception as e:
-        raise RuntimeError(f"Failed to download Kling video: {e}")
 def download_midjourney_video(mj_url):
     """Download MidJourney videos via proxy"""
@@ -269,14 +187,17 @@ def download_midjourney_video(mj_url):
         proxy_base = "https://9cee417c-5874-4e53-939a-52ad3f6f2f30-00-16i6nbwyeqga.picard.replit.dev/"
         proxy_url = f"{proxy_base}?url={mj_url}"
         try:
             parsed_url = urlparse(mj_url)
             url_filename = os.path.basename(parsed_url.path)
             if url_filename and '.' in url_filename:
                 temp_path = f"temp_mj_{url_filename}"
             else:
                 temp_path = f"temp_mj_{int(time.time())}.mp4"
         except:
             temp_path = f"temp_mj_{int(time.time())}.mp4"
         print(f"DEBUG: Downloading MJ video via proxy: {proxy_url}")
@@ -302,6 +223,7 @@ def download_generic_video(url):
         response = requests.get(url, headers=headers, stream=True, timeout=30)
         response.raise_for_status()
         temp_path = f"temp_generic_{int(time.time())}.mp4"
         with open(temp_path, "wb") as f:
@@ -313,52 +235,11 @@ def download_generic_video(url):
     except Exception as e:
         raise RuntimeError(f"Failed to download generic video: {e}")
-def download_video_from_url(original_url):
-    """Universal video downloader for MJ, Civitai, Kling, and others"""
-    try:
-        if not validate_url(original_url):
-            raise ValueError("Invalid URL format")
-        source = detect_video_source(original_url)
-        print(f"DEBUG: Detected video source: {source}")
-        if source == "civitai":
-            return download_civitai_video(original_url)
-        elif source == "kling":
-            return download_kling_video(original_url)
-        elif source == "midjourney":
-            return download_midjourney_video(original_url)
-        else:
-            return download_generic_video(original_url)
-    except Exception as e:
-        raise RuntimeError(f"Failed to download video: {e}")
-# --- Testing functions ---
-def test_civitai_download(url):
-    """Test function to check what format we get from Civitai"""
-    try:
-        print(f"🧪 Testing Civitai download: {url}")
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
-            'Referer': 'https://civitai.com/',
-            'Accept': 'video/webm,video/mp4,video/*;q=0.9,*/*;q=0.8',
-        }
-        response = requests.head(url, headers=headers, timeout=10)
-        print(f"📋 Response Status: {response.status_code}")
-        print(f"📋 Content-Type: {response.headers.get('content-type', 'Unknown')}")
-        print(f"📋 Content-Length: {response.headers.get('content-length', 'Unknown')} bytes")
-        if response.status_code == 200:
-            return True, "✅ Civitai URL is accessible"
-        else:
-            return False, f"❌ Status: {response.status_code}"
-    except Exception as e:
-        return False, f"❌ Error: {str(e)}"
 # --- Main inference function ---
 def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
@@ -366,35 +247,43 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
     try:
         max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
         input_path = upload_video or video_url
         if not input_path:
             return None, None, "Error: No video source provided"
         base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
         print(f"DEBUG: Final filename locked in: '{base_name}'")
         output_dir = "./outputs"
         os.makedirs(output_dir, exist_ok=True)
         vis_video_path = os.path.join(output_dir, base_name + "_vis.mp4")
         rgbd_video_path = os.path.join(output_dir, base_name + "_RGBD.mp4")
         print(f"DEBUG: Output files - Vis: '{vis_video_path}', RGBD: '{rgbd_video_path}'")
         print("Reading video frames...")
         frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
         if len(frames) == 0:
             return None, None, "Error: No frames could be extracted from video"
         print("Generating depth maps...")
         depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
         save_video(depths, vis_video_path, fps=fps, is_depths=True)
         rgbd_path = None
         if stitch:
             print("Creating RGBD stitched video...")
             full_frames, _ = read_video_frames(input_path, max_len, target_fps, max_res=-1)
             d_min, d_max = depths.min(), depths.max()
             stitched_frames = []
@@ -403,6 +292,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
                 rgb = full_frames[i]
                 depth = ((depths[i] - d_min) / (d_max - d_min) * 255).astype(np.uint8)
                 if grayscale:
                     if convert_from_color:
                         import matplotlib
@@ -417,16 +307,20 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
                     cmap = matplotlib.colormaps.get_cmap("inferno")
                     depth_vis = (cmap(depth / 255.0)[..., :3] * 255).astype(np.uint8)
                 if blur > 0:
                     kernel = int(blur * 20) * 2 + 1
                     depth_vis = cv2.GaussianBlur(depth_vis, (kernel, kernel), 0)
                 depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
                 stitched = cv2.hconcat([rgb, depth_resized])
                 stitched_frames.append(stitched)
             save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
             try:
                 temp_audio_path = rgbd_video_path.replace('.mp4', '_audio.mp4')
                 cmd = [
@@ -442,6 +336,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
                 print(f"Audio processing failed: {e}")
                 rgbd_path = rgbd_video_path
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
@@ -466,16 +361,19 @@ def on_video_upload_change(video_file, use_blip):
         current_video_file = None
         blip_generated_name = ""
         original_filename = ""
-        return "", gr.update(), "Upload a video file"
     try:
         current_video_file = video_file
-        current_video_url = None
         print(f"DEBUG: Processing upload - video_file type: {type(video_file)}")
-        original_filename = "uploaded_video"
         if hasattr(video_file, 'name') and video_file.name:
             print(f"DEBUG: video_file.name = '{video_file.name}'")
             original_name = os.path.splitext(os.path.basename(video_file.name))[0]
@@ -484,6 +382,7 @@ def on_video_upload_change(video_file, use_blip):
                 original_filename = cleaned
                 print(f"DEBUG: Method 1 success: '{original_filename}'")
         elif hasattr(video_file, 'orig_name') and video_file.orig_name:
             print(f"DEBUG: video_file.orig_name = '{video_file.orig_name}'")
             original_name = os.path.splitext(os.path.basename(video_file.orig_name))[0]
@@ -492,6 +391,7 @@ def on_video_upload_change(video_file, use_blip):
                 original_filename = cleaned
                 print(f"DEBUG: Method 2 success: '{original_filename}'")
         elif isinstance(video_file, str):
             print(f"DEBUG: video_file is string: '{video_file}'")
             original_name = os.path.splitext(os.path.basename(video_file))[0]
@@ -502,6 +402,7 @@ def on_video_upload_change(video_file, use_blip):
         print(f"DEBUG: Final original filename set to: '{original_filename}'")
         blip_generated_name = ""
         if use_blip:
             print("DEBUG: Starting optimized BLIP processing...")
@@ -509,10 +410,11 @@ def on_video_upload_change(video_file, use_blip):
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
         final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
         print(f"DEBUG: Final name returned: '{final_name}' (BLIP: {use_blip})")
-        return final_name, gr.update(), "Video uploaded successfully!"
     except Exception as e:
         error_msg = f"Upload processing failed: {str(e)}"
@@ -520,80 +422,55 @@ def on_video_upload_change(video_file, use_blip):
         return "uploaded_video", gr.update(), error_msg
 def on_video_url_change(url, use_blip):
-    """Handle URL input change with support for MJ, Civitai, and Kling"""
     global current_video_file, current_video_url, blip_generated_name, original_filename
-    print(f"DEBUG: URL handler called with URL: '{url}'")
     if not url or url.strip() == "":
-        print("DEBUG: Empty URL - clearing state")
         current_video_file = None
         current_video_url = None
         blip_generated_name = ""
         original_filename = ""
-        return gr.update(), "", "Enter a video URL (MidJourney, Civitai, or Kling supported)"
     try:
         source = detect_video_source(url)
-        print(f"DEBUG: Processing URL for source: {source}")
-        if source == "civitai":
-            print("🔍 Civitai URL detected - running test...")
-            test_success, test_message = test_civitai_download(url)
-            print(test_message)
-            if not test_success:
-                return gr.update(), "", f"Civitai test failed: {test_message}"
-        # Extract filename
         try:
-            parsed_url = urlparse(url)
-            url_path = parsed_url.path
             if source == "civitai":
-                path_parts = url_path.split('/')
                 for part in reversed(path_parts):
-                    if part and '.' in part:
-                        clean_name = os.path.splitext(part)[0]
-                        original_filename = "".join(c for c in clean_name if c.isalnum() or c in "_-")[:30]
-                        break
-                    elif part and len(part) > 2 and not part.startswith('transcode'):
-                        original_filename = "".join(c for c in part if c.isalnum() or c in "_-")[:30]
-                        break
                 else:
                     original_filename = "civitai_video"
-            elif source == "kling":
-                import re
-                match = re.search(r'/([a-f0-9-]{36})_', url)
-                if match:
-                    video_id = match.group(1)[:12]
-                    original_filename = f"kling_{video_id}"
-                else:
-                    original_filename = "kling_video"
             else:
-                url_filename = os.path.basename(url_path)
-                if url_filename and '.' in url_filename:
-                    url_name = os.path.splitext(url_filename)[0]
-                    original_filename = "".join(c for c in url_name if c.isalnum() or c in "_-")[:30]
-                    if not original_filename:
-                        original_filename = "downloaded_video"
-                else:
-                    original_filename = "downloaded_video"
         except:
             original_filename = f"{source}_video" if source != "unknown" else "downloaded_video"
-        print(f"DEBUG: CLEAN original filename extracted: '{original_filename}' (source: {source})")
-        print(f"Downloading {source} video from URL: {url}")
-        video_path = download_video_from_url(url)
-        current_video_file = None
-        current_video_url = video_path
         blip_generated_name = ""
         if use_blip and video_path:
             try:
                 print("DEBUG: Starting optimized BLIP processing for URL video...")
@@ -604,46 +481,41 @@ def on_video_url_change(url, use_blip):
                 print(f"BLIP naming failed: {e}")
                 blip_generated_name = ""
         final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
         print(f"DEBUG: {source.title()} final name returned: '{final_name}' (BLIP: {use_blip})")
-        if source in ["civitai", "kling"]:
-            if os.path.exists(video_path):
-                file_size = os.path.getsize(video_path)
-                print(f"📁 Downloaded file: {video_path} ({file_size} bytes)")
-                success_msg = f"✅ {source.title()} video downloaded! File: {os.path.basename(video_path)}"
-            else:
-                success_msg = f"✅ {source.title()} video processed!"
-        else:
-            success_msg = f"✅ {source.title()} video downloaded successfully!"
         return video_path, final_name, success_msg
     except Exception as e:
         error_msg = f"Download failed: {str(e)}"
-        print(f"DEBUG: URL handler error: {error_msg}")
-        return gr.update(), "", error_msg
 def on_blip_toggle(use_blip):
     """Handle BLIP checkbox toggle - switch between BLIP and original name"""
     global current_video_file, current_video_url, blip_generated_name, original_filename
     if current_video_file is None and current_video_url is None:
         return "", "No video loaded"
     print(f"DEBUG: Toggle called - BLIP: {use_blip}, Original: '{original_filename}', BLIP name: '{blip_generated_name}'")
     try:
         if use_blip and not blip_generated_name:
             if current_video_file:
                 frame = get_middle_frame_for_blip(current_video_file, target_size=480)
                 blip_generated_name = generate_blip_name(frame)
                 print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
             elif current_video_url:
                 frame = get_middle_frame_for_blip(current_video_url, target_size=480)
                 blip_generated_name = generate_blip_name(frame)
                 print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
         if use_blip and blip_generated_name:
             final_name = blip_generated_name
             status = "Using BLIP generated name"
@@ -651,4 +523,173 @@ def on_blip_toggle(use_blip):
             final_name = original_filename if original_filename else "video"
             status = "Using original filename"
-        print(f"

 import gradio as gr
 import subprocess
 import requests
 from urllib.parse import urlparse
 from huggingface_hub import hf_hub_download
 from video_depth_anything.video_depth import VideoDepthAnything
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
 def generate_blip_name(frame: np.ndarray) -> str:
     """Generate filename from frame using BLIP image captioning"""
     try:
         # Remove common stopwords and create filename
         stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
         words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
         trimmed = "_".join(words[:3])
         return trimmed[:30] if trimmed else "video"
     except Exception as e:
         print(f"BLIP error: {e}")
         return "video"
+# --- Load depth model ---
+print("Loading Video Depth Anything model...")
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+encoder = 'vitl'
+model_name = 'Large'
+model_configs = {
+    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+}
+video_depth_anything = VideoDepthAnything(**model_configs[encoder])
+ckpt_path = hf_hub_download(repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
+                            filename=f"video_depth_anything_{encoder}.pth",
+                            cache_dir="/tmp/huggingface")
+video_depth_anything.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
+video_depth_anything = video_depth_anything.to(DEVICE).eval()
+# --- URL validation and download ---
 def validate_url(url):
     """Validate if URL is properly formatted"""
     try:
     else:
         return "unknown"
 def optimize_civitai_url(url):
     """Convert gallery Civitai URLs to original quality to avoid dimension issues"""
     if "image.civitai.com" in url and "width=450" in url:
         # Optimize URL to avoid dimension issues
         civitai_url = optimize_civitai_url(civitai_url)
+        # Civitai videos can often be downloaded directly
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Referer': 'https://civitai.com/',
             'Accept': 'video/webm,video/mp4,video/*;q=0.9,*/*;q=0.8',
         }
+        # Try direct download first
         print(f"DEBUG: Downloading optimized Civitai video: {civitai_url}")
         response = requests.get(civitai_url, headers=headers, stream=True, timeout=30)
         # Create filename based on URL
         try:
             parsed_url = urlparse(civitai_url)
+            # Extract filename from URL path
             path_parts = parsed_url.path.split('/')
             if len(path_parts) > 1:
+                # Get the last part that might be a filename
                 filename_part = path_parts[-1]
                 if '.' in filename_part:
                     temp_path = f"temp_civitai_{filename_part}"
             else:
                 temp_path = f"temp_civitai_{int(time.time())}.webm"
         except:
+            import time
             temp_path = f"temp_civitai_{int(time.time())}.webm"
         # Download the file
     except Exception as e:
         raise RuntimeError(f"Failed to download Civitai video: {e}")
+def download_video_from_url(original_url):
+    """Universal video downloader for MJ, Civitai, and others"""
     try:
+        if not validate_url(original_url):
+            raise ValueError("Invalid URL format")
+        # Detect source and use appropriate method
+        source = detect_video_source(original_url)
+        print(f"DEBUG: Detected video source: {source}")
+        if source == "civitai":
+            return download_civitai_video(original_url)
+        elif source == "midjourney":
+            return download_midjourney_video(original_url)
+        else:
+            return download_generic_video(original_url)
     except Exception as e:
+        raise RuntimeError(f"Failed to download video: {e}")
 def download_midjourney_video(mj_url):
     """Download MidJourney videos via proxy"""
         proxy_base = "https://9cee417c-5874-4e53-939a-52ad3f6f2f30-00-16i6nbwyeqga.picard.replit.dev/"
         proxy_url = f"{proxy_base}?url={mj_url}"
+        # Create filename
         try:
             parsed_url = urlparse(mj_url)
             url_filename = os.path.basename(parsed_url.path)
             if url_filename and '.' in url_filename:
                 temp_path = f"temp_mj_{url_filename}"
             else:
+                import time
                 temp_path = f"temp_mj_{int(time.time())}.mp4"
         except:
+            import time
             temp_path = f"temp_mj_{int(time.time())}.mp4"
         print(f"DEBUG: Downloading MJ video via proxy: {proxy_url}")
         response = requests.get(url, headers=headers, stream=True, timeout=30)
         response.raise_for_status()
+        import time
         temp_path = f"temp_generic_{int(time.time())}.mp4"
         with open(temp_path, "wb") as f:
     except Exception as e:
         raise RuntimeError(f"Failed to download generic video: {e}")
+# --- Global variables for toggling ---
+current_video_file = None
+current_video_url = None
+blip_generated_name = ""
+original_filename = ""
 # --- Main inference function ---
 def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
     try:
         max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
+        # Determine input source
         input_path = upload_video or video_url
         if not input_path:
             return None, None, "Error: No video source provided"
+        # Fix filename at generation time (no more changing after this point)
         base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
         print(f"DEBUG: Final filename locked in: '{base_name}'")
+        # Create output directory
         output_dir = "./outputs"
         os.makedirs(output_dir, exist_ok=True)
+        # Use final names (not temp names!)
         vis_video_path = os.path.join(output_dir, base_name + "_vis.mp4")
         rgbd_video_path = os.path.join(output_dir, base_name + "_RGBD.mp4")
         print(f"DEBUG: Output files - Vis: '{vis_video_path}', RGBD: '{rgbd_video_path}'")
+        # Process video frames
         print("Reading video frames...")
         frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
         if len(frames) == 0:
             return None, None, "Error: No frames could be extracted from video"
+        # Generate depth maps
         print("Generating depth maps...")
         depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
+        # Save depth visualization with final name
         save_video(depths, vis_video_path, fps=fps, is_depths=True)
         rgbd_path = None
         if stitch:
             print("Creating RGBD stitched video...")
+            # Read full resolution frames for stitching
             full_frames, _ = read_video_frames(input_path, max_len, target_fps, max_res=-1)
             d_min, d_max = depths.min(), depths.max()
             stitched_frames = []
                 rgb = full_frames[i]
                 depth = ((depths[i] - d_min) / (d_max - d_min) * 255).astype(np.uint8)
+                # Apply depth visualization options
                 if grayscale:
                     if convert_from_color:
                         import matplotlib
                     cmap = matplotlib.colormaps.get_cmap("inferno")
                     depth_vis = (cmap(depth / 255.0)[..., :3] * 255).astype(np.uint8)
+                # Apply blur if requested
                 if blur > 0:
                     kernel = int(blur * 20) * 2 + 1
                     depth_vis = cv2.GaussianBlur(depth_vis, (kernel, kernel), 0)
+                # Resize depth to match RGB and stitch side by side
                 depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
                 stitched = cv2.hconcat([rgb, depth_resized])
                 stitched_frames.append(stitched)
+            # Save stitched video with final name
             save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
+            # Add audio from original video if possible
             try:
                 temp_audio_path = rgbd_video_path.replace('.mp4', '_audio.mp4')
                 cmd = [
                 print(f"Audio processing failed: {e}")
                 rgbd_path = rgbd_video_path
+        # Clean up memory
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         current_video_file = None
         blip_generated_name = ""
         original_filename = ""
+        return "", gr.update(), "Upload a video file"  # Don't change URL when clearing
     try:
+        # Store the current video
         current_video_file = video_file
+        current_video_url = None  # Clear URL when uploading file
         print(f"DEBUG: Processing upload - video_file type: {type(video_file)}")
+        # Generate original filename FIRST - try multiple ways
+        original_filename = "uploaded_video"  # Default fallback
+        # Method 1: Check .name attribute
         if hasattr(video_file, 'name') and video_file.name:
             print(f"DEBUG: video_file.name = '{video_file.name}'")
             original_name = os.path.splitext(os.path.basename(video_file.name))[0]
                 original_filename = cleaned
                 print(f"DEBUG: Method 1 success: '{original_filename}'")
+        # Method 2: Check .orig_name attribute (Gradio sometimes uses this)
         elif hasattr(video_file, 'orig_name') and video_file.orig_name:
             print(f"DEBUG: video_file.orig_name = '{video_file.orig_name}'")
             original_name = os.path.splitext(os.path.basename(video_file.orig_name))[0]
                 original_filename = cleaned
                 print(f"DEBUG: Method 2 success: '{original_filename}'")
+        # Method 3: Try to get filename from the file path itself
         elif isinstance(video_file, str):
             print(f"DEBUG: video_file is string: '{video_file}'")
             original_name = os.path.splitext(os.path.basename(video_file))[0]
         print(f"DEBUG: Final original filename set to: '{original_filename}'")
+        # Generate BLIP name
         blip_generated_name = ""
         if use_blip:
             print("DEBUG: Starting optimized BLIP processing...")
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
+        # Return appropriate name based on BLIP setting
         final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
         print(f"DEBUG: Final name returned: '{final_name}' (BLIP: {use_blip})")
+        print(f"DEBUG: Returning - filename: '{final_name}', clear URL: '', status: 'success'")
+        return final_name, "", "Video uploaded successfully!"  # Clear URL when video uploaded
     except Exception as e:
         error_msg = f"Upload processing failed: {str(e)}"
         return "uploaded_video", gr.update(), error_msg
 def on_video_url_change(url, use_blip):
+    """Handle URL input change with support for MJ and Civitai"""
     # Returns a 3-tuple: (video_path from download_video_from_url, or None;
     # filename text; status message).
     # Side effect: rewrites the module-level current_video_file,
     # current_video_url, blip_generated_name and original_filename.
     # NOTE(review): this text is a diff rendering and lines between hunks
     # appear to be elided, so the body below is not complete Python as shown.
     global current_video_file, current_video_url, blip_generated_name, original_filename
     # Blank or whitespace-only URL: reset all tracked state and return a prompt.
     if not url or url.strip() == "":
         current_video_file = None
         current_video_url = None
         blip_generated_name = ""
         original_filename = ""
+        return None, "", "Enter a video URL (MidJourney or Civitai supported)"
     try:
         # detect_video_source / download_video_from_url are defined outside
         # this view; `source` is compared against "civitai", "midjourney"
         # and "unknown" below.
         source = detect_video_source(url)
+        print(f"Downloading {source} video from URL: {url}")
+        video_path = download_video_from_url(url)
+        # Store the current video info
+        current_video_file = None  # Clear file when using URL
+        current_video_url = video_path
+        # Set original filename based on source
         try:
             if source == "civitai":
+                # Extract filename from Civitai URL
+                parsed_url = urlparse(url)
+                path_parts = parsed_url.path.split('/')
+                # Look for meaningful filename in path
                 # Scan path segments from last to first; take the first one
                 # that has no '.', is longer than 3 characters, and is still
                 # non-empty after keeping only alphanumerics, '_' and '-'
                 # (truncated to 20 characters).
                 for part in reversed(path_parts):
+                    if part and '.' not in part and len(part) > 3:
+                        cleaned = "".join(c for c in part if c.isalnum() or c in "_-")[:20]
+                        if cleaned:
+                            original_filename = f"civitai_{cleaned}"
+                            break
                 # for/else: runs only when the loop ended without `break`,
                 # i.e. no usable path segment was found.
                 else:
                     original_filename = "civitai_video"
+            elif source == "midjourney":
+                original_filename = "midjourney_video"
             else:
+                original_filename = "downloaded_video"
         # Bare except: any failure while deriving the name falls back to a
         # generic per-source name instead of aborting the download flow.
         except:
             original_filename = f"{source}_video" if source != "unknown" else "downloaded_video"
+        print(f"DEBUG: {source.title()} original filename set to: '{original_filename}'")
         blip_generated_name = ""
+        # Generate BLIP name if requested
         if use_blip and video_path:
             # NOTE(review): the `try:` below has no visible `except` clause and
             # the failure message references `e`; the BLIP call and an
             # `except ... as e:` line seem to be missing from this view --
             # TODO confirm against the real file.
             try:
                 print("DEBUG: Starting optimized BLIP processing for URL video...")
                 print(f"BLIP naming failed: {e}")
                 blip_generated_name = ""
+        # Return appropriate name
         # The BLIP name is used only when BLIP is enabled and produced a
         # non-empty name; otherwise the source-derived name is used.
         final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
+        success_msg = f"✅ {source.title()} video downloaded successfully!"
         print(f"DEBUG: {source.title()} final name returned: '{final_name}' (BLIP: {use_blip})")
         return video_path, final_name, success_msg
     # Any exception from the outer try (detection, download, ...) is reported
     # through the status message rather than raised to the caller.
     except Exception as e:
         error_msg = f"Download failed: {str(e)}"
+        print(error_msg)
+        return None, "", error_msg
 def on_blip_toggle(use_blip):
     """Handle BLIP checkbox toggle - switch between BLIP and original name"""
     # Returns a 2-tuple: (filename text, status message).
     # Reads the module-level video/name state and may update
     # blip_generated_name when a name is generated on demand.
     global current_video_file, current_video_url, blip_generated_name, original_filename
+    # Only react if we have a video loaded
     if current_video_file is None and current_video_url is None:
         return "", "No video loaded"
     print(f"DEBUG: Toggle called - BLIP: {use_blip}, Original: '{original_filename}', BLIP name: '{blip_generated_name}'")
     try:
+        # If toggling BLIP on and we don't have a BLIP name yet, generate it
         # get_middle_frame_for_blip / generate_blip_name are defined outside
         # this view; the uploaded file takes priority over the URL video.
         if use_blip and not blip_generated_name:
             if current_video_file:
                 frame = get_middle_frame_for_blip(current_video_file, target_size=480)
                 blip_generated_name = generate_blip_name(frame)
                 print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
             elif current_video_url:
+                # For URL videos, we might need to re-read frames
                 frame = get_middle_frame_for_blip(current_video_url, target_size=480)
                 blip_generated_name = generate_blip_name(frame)
                 print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
+        # Return appropriate name based on toggle
         if use_blip and blip_generated_name:
             final_name = blip_generated_name
             status = "Using BLIP generated name"
             # NOTE(review): the next two assignments sit at the same
             # indentation as the two above and would overwrite them; an
             # `else:` line presumably belongs here and is missing from this
             # diff view -- TODO confirm against the real file.
             final_name = original_filename if original_filename else "video"
             status = "Using original filename"
+        print(f"DEBUG: Toggle returning: '{final_name}' - {status}")
+        return final_name, status
     # On any failure, fall back to the original filename (or "video") and
     # surface the error text as the status message.
+    except Exception as e:
+        error_msg = f"Name generation failed: {str(e)}"
+        print(error_msg)
+        fallback = original_filename if original_filename else "video"
+        return fallback, error_msg
+# --- Gradio Interface ---
 # Defines the UI components and wires their events to the handler functions.
 # Everything, including the final demo.queue(...) call, runs inside the
 # gr.Blocks context bound to `demo`.
+with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
+    gr.Markdown("""
+    # 🎥 Video Depth Anything + RGBD Output
+    Generate depth maps from videos and watch RGBD videos on holographic displays like Looking Glass Go.
+    Upload a video or paste a video URL (Midjourney, Civitai, or Kling).
+    [🔗 Project Page](https://videodepthanything.github.io/) | [📖 Paper](https://arxiv.org/abs/2401.01884)
+    """)
     # NOTE(review): the intro text and the URL field mention Kling, while the
     # empty-URL prompt in on_video_url_change names only MidJourney and
     # Civitai; whether detect_video_source handles Kling is not visible
     # here -- TODO confirm.
+    # Status display
+    status_display = gr.HTML("")
     # Top row: upload input plus the two read-only result players
     # (column scales 1 : 1 : 2).
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=1):
+            upload_video = gr.Video(
+                label="📁 Upload Video",
+                height=500,
+                show_label=True
+            )
+        with gr.Column(scale=1):
+            depth_out = gr.Video(
+                label="🎨 Depth Visualization",
+                interactive=False,
+                autoplay=True,
+                height=500,
+                show_label=True
+            )
+        with gr.Column(scale=2):
+            rgbd_out = gr.Video(
+                label="🔄 RGBD Side-by-Side",
+                interactive=False,
+                autoplay=True,
+                height=500,
+                show_label=True
+            )
     # Second row: URL input, BLIP naming toggle, and the output filename box.
+    with gr.Row():
+        video_url = gr.Textbox(
+            label="🔗 Video URL (MJ, Civitai, or Kling)",
+            placeholder="Paste MidJourney, Civitai, or Kling video URL here...",
+            scale=4
+        )
+        use_blip = gr.Checkbox(
+            label="🤖 Auto-name with BLIP",
+            value=True,
+            scale=2,
+            info="Generate filename from video content"
+        )
+        filename = gr.Textbox(
+            label="📝 Output Filename (_RGBD.mp4 will be added)",
+            placeholder="Enter filename or let BLIP generate it",
+            scale=4
+        )
+    # Event handlers for input changes - FIXED to prevent interference
     # The URL handler's first return value is routed to upload_video, the
     # second to filename, the third to status_display.
+    video_url.change(
+        fn=on_video_url_change,
+        inputs=[video_url, use_blip],
+        outputs=[upload_video, filename, status_display],  # URL loads video to upload field
+        queue=False  # Don't queue URL changes
+    )
     # The upload handler returns (filename, URL-field value, status); it is
     # bound to .upload rather than .change on the same component.
+    upload_video.upload(  # Use .upload instead of .change
+        fn=on_video_upload_change,
+        inputs=[upload_video, use_blip],
+        outputs=[filename, video_url, status_display],  # Upload clears URL field
+        queue=False  # Don't queue uploads
+    )
+    # Toggle BLIP checkbox to switch between names
     # Unlike the two handlers above, no queue argument is passed here.
+    use_blip.change(
+        fn=on_blip_toggle,
+        inputs=[use_blip],
+        outputs=[filename, status_display]
+    )
     # Processing options; per the info strings, -1 means "all frames" for
     # Max Frames and "original" for Target FPS.
+    with gr.Accordion("⚙️ Advanced Settings", open=False):
+        with gr.Row():
+            max_len = gr.Slider(
+                label="Max Frames",
+                minimum=-1,
+                maximum=1000,
+                value=-1,
+                step=1,
+                info="Maximum frames to process (-1 for all)"
+            )
+            target_fps = gr.Slider(
+                label="Target FPS",
+                minimum=-1,
+                maximum=30,
+                value=-1,
+                step=1,
+                info="Output FPS (-1 for original)"
+            )
+            max_res = gr.Slider(
+                label="Max Resolution",
+                minimum=480,
+                maximum=1920,
+                value=1280,
+                step=1,
+                info="Maximum resolution for processing"
+            )
+        with gr.Row():
+            stitch = gr.Checkbox(
+                label="Create RGBD Output",
+                value=True,
+                info="Generate side-by-side RGB + Depth video"
+            )
+            grayscale = gr.Checkbox(
+                label="Grayscale Depth",
+                value=True,
+                info="Convert depth to grayscale"
+            )
+            convert_from_color = gr.Checkbox(
+                label="From Colormap",
+                value=True,
+                info="Convert from color before grayscale"
+            )
+            blur = gr.Slider(
+                label="Depth Blur",
+                minimum=0,
+                maximum=1,
+                value=0.3,
+                step=0.01,
+                info="Blur amount for depth visualization"
+            )
+    run_btn = gr.Button("🚀 Generate Depth Video", variant="primary", size="lg")
+    # Main processing event
     # Passes eleven input values, in the order listed, to
     # infer_video_depth_from_source (defined outside this view) and routes
     # its three return values to the two result players and the status area.
+    run_btn.click(
+        fn=infer_video_depth_from_source,
+        inputs=[
+            upload_video, video_url, filename, use_blip,
+            max_len, target_fps, max_res, stitch,
+            grayscale, convert_from_color, blur
+        ],
+        outputs=[depth_out, rgbd_out, status_display]
+    )
+    gr.Markdown("""
+    ### 💡 Tips:
+    - **Upload formats**: MP4, AVI, MOV, etc.
+    - **BLIP naming**: Automatically generates descriptive filenames
+    - **RGBD output**: Side-by-side comparison of original and depth
+    - **Processing time**: Depends on video length and resolution
+    - **Filename**: Set your preferred name before clicking Generate!
+    """)
     # Enables the event queue with max_size=10; see the Gradio Blocks.queue
     # documentation for how events beyond that limit are handled.
+    demo.queue(max_size=10)
 # Script entry point: launch the Gradio app only when run directly.
 # server_name "0.0.0.0" listens on all network interfaces, on port 7860.
+if __name__ == "__main__":
+    print("Starting Video Depth Anything interface...")
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )