Spaces:

Krokodilpirat
/

Video-Depth-Anything_RGBD_Zero

Running on Zero

App Files Files Community

Krokodilpirat committed on Jun 23

Commit

45326b4

verified ·

1 Parent(s): 18a6e82

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -75

app.py CHANGED Viewed

@@ -7,7 +7,9 @@ import torch
 import numpy as np
 import gradio as gr
 import subprocess
 import requests
 from huggingface_hub import hf_hub_download
 from video_depth_anything.video_depth import VideoDepthAnything
 from utils.dc_utils import read_video_frames, save_video
@@ -19,20 +21,27 @@ os.environ["HF_HOME"] = "/tmp/huggingface"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface/transformers"
 os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
-# Patch Gradio schema bug
 def patch_gradio_utils():
     try:
         from gradio_client import utils
         original_get_type = utils.get_type
         def patched_get_type(schema):
-            if isinstance(schema, bool): return "boolean"
-            if not isinstance(schema, dict): return "any"
             return original_get_type(schema)
         utils.get_type = patched_get_type
-    except: pass
 patch_gradio_utils()
-# Load BLIP
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
@@ -43,103 +52,116 @@ def generate_blip_name(frame: np.ndarray) -> str:
     caption = blip_processor.decode(out[0], skip_special_tokens=True).lower()
     stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
     words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
-    return "_".join(words[:3])[:30]
 # Load depth model
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-video_depth_anything = VideoDepthAnything(encoder='vitl', features=256, out_channels=[256,512,1024,1024])
-ckpt_path = hf_hub_download("depth-anything/Video-Depth-Anything-Large", filename="video_depth_anything_vitl.pth", cache_dir="/tmp/huggingface")
 video_depth_anything.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
 video_depth_anything = video_depth_anything.to(DEVICE).eval()
-# Proxy MJ download
-def download_video_from_url(url):
-    proxy = "https://9cee417c-5874-4e53-939a-52ad3f6f2f30-00-16i6nbwyeqga.picard.replit.dev/"
-    full = f"{proxy}?url={url}"
-    temp = "temp_video.mp4"
-    with requests.get(full, stream=True, timeout=20) as r:
-        r.raise_for_status()
-        with open(temp, "wb") as f:
-            for chunk in r.iter_content(chunk_size=8192):
-                if chunk: f.write(chunk)
-    return temp
-# Trigger: Clear upload if MJ
-def clear_uploaded_video(url):
-    return None, "Downloading MJ video...", None
-# Trigger: MJ download + optional BLIP
-def handle_video_url(url, use_blip):
-    path = download_video_from_url(url)
-    blip = ""
-    if use_blip:
-        frames, _ = read_video_frames(path, 999, -1, 480)
-        frame = frames[len(frames)//2]
-        blip = generate_blip_name(frame)
-    return path, blip
-# Trigger: Upload + optional BLIP
-def handle_upload(path, use_blip):
-    blip = ""
-    if use_blip:
-        frames, _ = read_video_frames(path, 999, -1, 480)
-        frame = frames[len(frames)//2]
-        blip = generate_blip_name(frame)
-    return blip
-# Main process
-def infer_video_depth_from_source(upload_video, video_url, custom_name, use_blip,
-                                  max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur):
-    input_path = upload_video or download_video_from_url(video_url)
-    base_name = os.path.splitext(os.path.basename(input_path))[0]
     if custom_name:
         base_name = custom_name.strip().replace(" ", "_")[:30]
     elif use_blip:
         frames, _ = read_video_frames(input_path, 999, -1, 480)
-        frame = frames[len(frames)//2]
         base_name = generate_blip_name(frame)
     output_dir = "./outputs"
     os.makedirs(output_dir, exist_ok=True)
-    stitched_path = os.path.join(output_dir, base_name + "_RGBD.mp4")
-    vis_path = os.path.join(output_dir, base_name + "_vis.mp4")
     frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
     depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
-    save_video(depths, vis_path, fps=fps, is_depths=True)
     if stitch:
         full_frames, _ = read_video_frames(input_path, max_len, target_fps, max_res=-1)
         d_min, d_max = depths.min(), depths.max()
         stitched_frames = []
         for i in range(min(len(full_frames), len(depths))):
             rgb = full_frames[i]
             depth = ((depths[i] - d_min) / (d_max - d_min) * 255).astype(np.uint8)
             if grayscale:
-                import matplotlib
-                cmap = matplotlib.colormaps.get_cmap("inferno")
-                depth_color = (cmap(depth / 255.0)[..., :3] * 255).astype(np.uint8)
-                gray = cv2.cvtColor(depth_color, cv2.COLOR_RGB2GRAY)
-                depth_vis = np.stack([gray]*3, axis=-1) if convert_from_color else np.stack([depth]*3, axis=-1)
             else:
                 import matplotlib
                 cmap = matplotlib.colormaps.get_cmap("inferno")
                 depth_vis = (cmap(depth / 255.0)[..., :3] * 255).astype(np.uint8)
             if blur > 0:
-                k = int(blur * 20) * 2 + 1
-                depth_vis = cv2.GaussianBlur(depth_vis, (k, k), 0)
             depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
-            stitched_frames.append(cv2.hconcat([rgb, depth_resized]))
-        save_video(np.array(stitched_frames), stitched_path, fps=fps)
-        temp_audio = stitched_path.replace('_RGBD.mp4', '_RGBD_audio.mp4')
-        cmd = ["ffmpeg", "-y", "-i", stitched_path, "-i", input_path, "-c:v", "copy", "-c:a", "aac",
-               "-map", "0:v:0", "-map", "1:a:0?", "-shortest", temp_audio]
-        subprocess.run(cmd)
-        os.replace(temp_audio, stitched_path)
-    gc.collect(); torch.cuda.empty_cache()
-    return vis_path, stitched_path, input_path, base_name
 # Gradio UI
 with gr.Blocks(analytics_enabled=False, css="""
@@ -152,7 +174,8 @@ with gr.Blocks(analytics_enabled=False, css="""
 """) as demo:
     gr.Markdown("# Video Depth Anything + RGBD sbs output")
-    gr.Markdown("Upload a video or paste a URL to generate RGBD output.")
     with gr.Row(equal_height=True):
         upload_video = gr.Video(label="Upload Video", height=360, scale=1)
@@ -160,14 +183,40 @@ with gr.Blocks(analytics_enabled=False, css="""
         rgbd_out = gr.Video(label="RGBD Output", interactive=False, autoplay=True, show_share_button=True, height=360, scale=2)
     with gr.Row():
-        video_url = gr.Textbox(label="Paste MJ video URL", scale=3)
         use_blip = gr.Checkbox(label="Use BLIP for automatic file name", value=True, scale=1)
         blip_name_display = gr.Textbox(label="BLIP file name", interactive=False, scale=2)
         custom_name = gr.Textbox(label="Custom file name", scale=3)
-    video_url.change(fn=clear_uploaded_video, inputs=[video_url], outputs=[upload_video, blip_name_display, custom_name])
-    video_url.change(fn=handle_video_url, inputs=[video_url, use_blip], outputs=[upload_video, blip_name_display])
-    upload_video.change(fn=handle_upload, inputs=[upload_video, use_blip], outputs=[blip_name_display])
     with gr.Accordion("Advanced Settings", open=False):
         max_len = gr.Slider(label="Max process length", minimum=-1, maximum=1000, value=-1, step=1)
@@ -179,10 +228,11 @@ with gr.Blocks(analytics_enabled=False, css="""
         blur = gr.Slider(label="Blur (for edge smoothing)", minimum=0, maximum=1, value=0.3, step=0.01)
     run_btn = gr.Button("Generate")
     run_btn.click(
         fn=infer_video_depth_from_source,
         inputs=[upload_video, video_url, custom_name, use_blip, max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur],
-        outputs=[depth_out, rgbd_out, upload_video, blip_name_display]
     )
 demo.queue()

 import numpy as np
 import gradio as gr
 import subprocess
+import urllib.request
 import requests
+from urllib.parse import urlparse
 from huggingface_hub import hf_hub_download
 from video_depth_anything.video_depth import VideoDepthAnything
 from utils.dc_utils import read_video_frames, save_video
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface/transformers"
 os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
+# Patch for Gradio schema bug
 def patch_gradio_utils():
     try:
         from gradio_client import utils
         original_get_type = utils.get_type
         def patched_get_type(schema):
+            if isinstance(schema, bool):
+                return "boolean"
+            if not isinstance(schema, dict):
+                return "any"
             return original_get_type(schema)
         utils.get_type = patched_get_type
+        print("Successfully patched Gradio utils.get_type")
+    except Exception as e:
+        print(f"Could not patch Gradio utils: {e}")
 patch_gradio_utils()
+# Load BLIP model
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
     caption = blip_processor.decode(out[0], skip_special_tokens=True).lower()
     stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
     words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
+    trimmed = "_".join(words[:3])
+    return trimmed[:30]
 # Load depth model
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+encoder = 'vitl'
+model_name = 'Large'
+model_configs = {
+    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+}
+video_depth_anything = VideoDepthAnything(**model_configs[encoder])
+ckpt_path = hf_hub_download(repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
+                            filename=f"video_depth_anything_{encoder}.pth",
+                            cache_dir="/tmp/huggingface")
 video_depth_anything.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
 video_depth_anything = video_depth_anything.to(DEVICE).eval()
+# MJ proxy download
def download_video_from_url(original_url):
    """Download a remote video through the proxy and return its local path.

    The target URL is sent to the proxy as its ``url`` query parameter and
    the response body is streamed to disk in 8 KiB chunks.

    Args:
        original_url: URL of the video to fetch.

    Returns:
        Path of the downloaded file. The file is always named
        ``temp_video.mp4`` and is placed in a new temporary directory that is
        created for this call.

    Raises:
        RuntimeError: If creating the directory, the HTTP request, or writing
            the file fails. The underlying exception is set as ``__cause__``.
    """
    import shutil
    import tempfile

    proxy_base = "https://9cee417c-5874-4e53-939a-52ad3f6f2f30-00-16i6nbwyeqga.picard.replit.dev/"
    download_dir = None
    try:
        # One directory per call: a single shared ./temp_video.mp4 would be
        # overwritten by a second download while the first is still in use.
        download_dir = tempfile.mkdtemp(prefix="mj_video_")
        temp_path = os.path.join(download_dir, "temp_video.mp4")
        # Passing the target via params lets requests percent-encode it, so a
        # "&" or "#" inside original_url stays part of the url value.
        with requests.get(proxy_base, params={"url": original_url},
                          stream=True, timeout=20) as response:
            response.raise_for_status()
            with open(temp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return temp_path
    except Exception as e:
        # Do not leave a partial download behind.
        if download_dir is not None:
            shutil.rmtree(download_dir, ignore_errors=True)
        raise RuntimeError(f"Proxy download failed: {e}") from e
+# Inference
def _choose_base_name(input_path, custom_name, use_blip):
    """Pick the output file stem: custom name, else BLIP caption, else input stem."""
    fallback = os.path.splitext(os.path.basename(input_path))[0]
    candidate = ""
    if custom_name and custom_name.strip():
        candidate = custom_name.strip().replace(" ", "_")[:30]
    elif use_blip:
        frames, _ = read_video_frames(input_path, 999, -1, 480)
        if len(frames) > 0:
            # Caption the middle frame of the clip.
            candidate = generate_blip_name(frames[len(frames) // 2])
    # The stem is joined onto ./outputs, so neutralise path separators and
    # surrounding dots that would let it point outside that directory.
    candidate = candidate.replace("/", "_").replace("\\", "_").strip(".")
    # An empty stem would produce "_RGBD.mp4"; use the input's stem instead.
    return candidate or fallback


def _render_depth_frame(depth, cmap, grayscale, convert_from_color, blur):
    """Turn one uint8 depth map into a 3-channel uint8 image, optionally blurred."""
    if grayscale and not convert_from_color:
        # Plain grayscale: replicate the depth values into three channels.
        depth_vis = np.stack([depth] * 3, axis=-1)
    else:
        depth_vis = (cmap(depth / 255.0)[..., :3] * 255).astype(np.uint8)
        if grayscale:
            # Grayscale derived from the colour-mapped image.
            gray = cv2.cvtColor(depth_vis, cv2.COLOR_RGB2GRAY)
            depth_vis = np.stack([gray] * 3, axis=-1)
    if blur > 0:
        # Gaussian kernels must be odd-sized; blur in (0, 1] maps to 1..41.
        kernel = int(blur * 20) * 2 + 1
        depth_vis = cv2.GaussianBlur(depth_vis, (kernel, kernel), 0)
    return depth_vis


def _mux_source_audio(stitched_video_path, input_path):
    """Copy the first audio track of the input into the stitched video, best effort."""
    temp_audio_path = os.path.splitext(stitched_video_path)[0] + "_audio.mp4"
    cmd = [
        "ffmpeg", "-y", "-i", stitched_video_path, "-i", input_path,
        "-c:v", "copy", "-c:a", "aac", "-map", "0:v:0", "-map", "1:a:0?",
        "-shortest", temp_audio_path
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as e:
        print(f"Could not run ffmpeg, keeping video without audio: {e}")
        return
    if result.returncode == 0 and os.path.exists(temp_audio_path):
        os.replace(temp_audio_path, stitched_video_path)
        return
    # Audio is optional: keep the already written silent video.
    print(f"ffmpeg exited with code {result.returncode}, keeping video without audio")
    if os.path.exists(temp_audio_path):
        os.remove(temp_audio_path)


def infer_video_depth_from_source(upload_video, video_url, custom_name, use_blip, *args):
    """Run depth inference on a video and write the result videos to ./outputs.

    Args:
        upload_video: Path of an uploaded video; takes precedence when set.
        video_url: URL passed to ``download_video_from_url`` when no upload
            is given.
        custom_name: Optional output file stem (spaces become underscores,
            truncated to 30 characters).
        use_blip: When no custom name is given, name the output after a BLIP
            caption of the middle frame.
        *args: Exactly seven values, in order: ``max_len``, ``target_fps``,
            ``max_res``, ``stitch``, ``grayscale``, ``convert_from_color``,
            ``blur``.

    Returns:
        ``(vis_video_path, stitched_video_path)``. The second item is ``None``
        when ``stitch`` is false, because no side-by-side video is written in
        that case.

    Raises:
        ValueError: If neither ``upload_video`` nor ``video_url`` is provided,
            or ``args`` does not hold exactly seven values.
    """
    max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args

    if upload_video:
        input_path = upload_video
    elif video_url:
        input_path = download_video_from_url(video_url)
    else:
        raise ValueError("No video source provided.")

    base_name = _choose_base_name(input_path, custom_name, use_blip)
    output_dir = "./outputs"
    os.makedirs(output_dir, exist_ok=True)
    stitched_video_path = os.path.join(output_dir, base_name + "_RGBD.mp4")
    vis_video_path = os.path.join(output_dir, base_name + "_vis.mp4")

    frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
    depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
    save_video(depths, vis_video_path, fps=fps, is_depths=True)

    if stitch:
        # Re-read the source without the resolution cap (max_res=-1) for the RGB half.
        full_frames, _ = read_video_frames(input_path, max_len, target_fps, max_res=-1)
        d_min, d_max = depths.min(), depths.max()
        d_range = d_max - d_min
        if d_range == 0:
            # Constant depth everywhere: avoid 0/0 and map every pixel to 0.
            d_range = 1

        # The colormap is only needed for colour output or colour-derived
        # grayscale; look it up once instead of once per frame.
        cmap = None
        if convert_from_color or not grayscale:
            import matplotlib
            cmap = matplotlib.colormaps.get_cmap("inferno")

        stitched_frames = []
        # zip stops at the shorter sequence, like min(len(...), len(...)).
        for rgb, raw_depth in zip(full_frames, depths):
            depth = ((raw_depth - d_min) / d_range * 255).astype(np.uint8)
            depth_vis = _render_depth_frame(depth, cmap, grayscale, convert_from_color, blur)
            depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
            stitched_frames.append(cv2.hconcat([rgb, depth_resized]))

        save_video(np.array(stitched_frames), stitched_video_path, fps=fps)
        _mux_source_audio(stitched_video_path, input_path)

    gc.collect()
    torch.cuda.empty_cache()
    # Without stitching there is no RGBD file; returning its path would point
    # at a missing file or at a stale one from an earlier run.
    return vis_video_path, (stitched_video_path if stitch else None)
 # Gradio UI
 with gr.Blocks(analytics_enabled=False, css="""
 """) as demo:
     gr.Markdown("# Video Depth Anything + RGBD sbs output")
+    gr.Markdown("Upload a video or paste a URL to generate RGBD output.
+[Project Page](https://videodepthanything.github.io/)")
     with gr.Row(equal_height=True):
         upload_video = gr.Video(label="Upload Video", height=360, scale=1)
         rgbd_out = gr.Video(label="RGBD Output", interactive=False, autoplay=True, show_share_button=True, height=360, scale=2)
     with gr.Row():
+        video_url = gr.Textbox(label="Paste MJ video URL (experimental)", scale=3)
         use_blip = gr.Checkbox(label="Use BLIP for automatic file name", value=True, scale=1)
         blip_name_display = gr.Textbox(label="BLIP file name", interactive=False, scale=2)
         custom_name = gr.Textbox(label="Custom file name", scale=3)
+    # New triggers
+    def handle_mj_url(url, use_blip):
+        if not url.strip():
+            return None, ""
+        try:
+            temp_path = download_video_from_url(url)
+            frames, _ = read_video_frames(temp_path, 999, -1, 480)
+            blip = generate_blip_name(frames[len(frames) // 2]) if use_blip else ""
+            return temp_path, blip
+        except Exception as e:
+            return None, f"Download error: {e}"
+    video_url.change(
+        fn=handle_mj_url,
+        inputs=[video_url, use_blip],
+        outputs=[upload_video, blip_name_display]
+    )
+    def handle_upload(path, use_blip):
+        if not path or not use_blip:
+            return ""
+        frames, _ = read_video_frames(path, 999, -1, 480)
+        return generate_blip_name(frames[len(frames) // 2])
+    upload_video.change(
+        fn=handle_upload,
+        inputs=[upload_video, use_blip],
+        outputs=[blip_name_display]
+    )
     with gr.Accordion("Advanced Settings", open=False):
         max_len = gr.Slider(label="Max process length", minimum=-1, maximum=1000, value=-1, step=1)
         blur = gr.Slider(label="Blur (for edge smoothing)", minimum=0, maximum=1, value=0.3, step=0.01)
     run_btn = gr.Button("Generate")
     run_btn.click(
         fn=infer_video_depth_from_source,
         inputs=[upload_video, video_url, custom_name, use_blip, max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur],
+        outputs=[depth_out, rgbd_out]
     )
 demo.queue()