Krokodilpirat committed on
Commit
b77d16c
·
verified ·
1 Parent(s): b0290d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +273 -232
app.py CHANGED
@@ -7,7 +7,6 @@ import numpy as np
7
  import gradio as gr
8
  import subprocess
9
  import requests
10
- import time
11
  from urllib.parse import urlparse
12
  from huggingface_hub import hf_hub_download
13
  from video_depth_anything.video_depth import VideoDepthAnything
@@ -46,84 +45,6 @@ print("Loading BLIP model...")
46
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
47
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
48
 
49
- # --- Load depth model ---
50
- print("Loading Video Depth Anything model...")
51
- DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
52
- encoder = 'vitl'
53
- model_name = 'Large'
54
- model_configs = {
55
- 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
56
- }
57
- video_depth_anything = VideoDepthAnything(**model_configs[encoder])
58
- ckpt_path = hf_hub_download(repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
59
- filename=f"video_depth_anything_{encoder}.pth",
60
- cache_dir="/tmp/huggingface")
61
- video_depth_anything.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
62
- video_depth_anything = video_depth_anything.to(DEVICE).eval()
63
-
64
- # --- Global variables for toggling ---
65
- current_video_file = None
66
- current_video_url = None
67
- blip_generated_name = ""
68
- original_filename = ""
69
-
70
- # --- Optimized BLIP processing ---
71
- def get_middle_frame_for_blip(video_path, target_size=480):
72
- """Efficiently extract only the middle frame for BLIP processing"""
73
- try:
74
- cap = cv2.VideoCapture(video_path)
75
- if not cap.isOpened():
76
- raise ValueError(f"Could not open video: {video_path}")
77
-
78
- # Get total frame count
79
- frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
80
- if frame_count <= 0:
81
- raise ValueError("Video has no frames")
82
-
83
- # Calculate middle frame index
84
- middle_frame_idx = frame_count // 2
85
- print(f"DEBUG: Video has {frame_count} frames, extracting frame {middle_frame_idx} for BLIP")
86
-
87
- # Jump directly to middle frame (no loading of other frames!)
88
- cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_idx)
89
- ret, frame = cap.read()
90
-
91
- if not ret or frame is None:
92
- raise ValueError("Could not read middle frame")
93
-
94
- # Get original dimensions
95
- original_height, original_width = frame.shape[:2]
96
- print(f"DEBUG: Original frame size: {original_width} x {original_height}")
97
-
98
- # Calculate new dimensions maintaining aspect ratio
99
- if original_width > original_height:
100
- new_width = target_size
101
- new_height = int((original_height * target_size) / original_width)
102
- else:
103
- new_height = target_size
104
- new_width = int((original_width * target_size) / original_height)
105
-
106
- # Ensure even dimensions for compatibility
107
- new_width = new_width if new_width % 2 == 0 else new_width + 1
108
- new_height = new_height if new_height % 2 == 0 else new_height + 1
109
-
110
- print(f"DEBUG: BLIP frame resized to: {new_width} x {new_height}")
111
-
112
- # Resize only this one frame
113
- frame_resized = cv2.resize(frame, (new_width, new_height))
114
-
115
- # Convert BGR to RGB for BLIP
116
- frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
117
-
118
- cap.release()
119
- return frame_rgb
120
-
121
- except Exception as e:
122
- print(f"ERROR: Failed to extract middle frame: {e}")
123
- if 'cap' in locals():
124
- cap.release()
125
- raise
126
-
127
  def generate_blip_name(frame: np.ndarray) -> str:
128
  """Generate filename from frame using BLIP image captioning"""
129
  try:
@@ -139,17 +60,28 @@ def generate_blip_name(frame: np.ndarray) -> str:
139
  # Remove common stopwords and create filename
140
  stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
141
  words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
142
-
143
- # Remove duplicates while preserving order
144
- words = list(dict.fromkeys(words))
145
-
146
  trimmed = "_".join(words[:3])
147
  return trimmed[:30] if trimmed else "video"
148
  except Exception as e:
149
  print(f"BLIP error: {e}")
150
  return "video"
151
 
152
- # --- URL validation and video source detection ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def validate_url(url):
154
  """Validate if URL is properly formatted"""
155
  try:
@@ -169,7 +101,6 @@ def detect_video_source(url):
169
  else:
170
  return "unknown"
171
 
172
- # --- Video download functions ---
173
  def optimize_civitai_url(url):
174
  """Convert gallery Civitai URLs to original quality to avoid dimension issues"""
175
  if "image.civitai.com" in url and "width=450" in url:
@@ -187,12 +118,14 @@ def download_civitai_video(civitai_url):
187
  # Optimize URL to avoid dimension issues
188
  civitai_url = optimize_civitai_url(civitai_url)
189
 
 
190
  headers = {
191
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
192
  'Referer': 'https://civitai.com/',
193
  'Accept': 'video/webm,video/mp4,video/*;q=0.9,*/*;q=0.8',
194
  }
195
 
 
196
  print(f"DEBUG: Downloading optimized Civitai video: {civitai_url}")
197
 
198
  response = requests.get(civitai_url, headers=headers, stream=True, timeout=30)
@@ -201,8 +134,10 @@ def download_civitai_video(civitai_url):
201
  # Create filename based on URL
202
  try:
203
  parsed_url = urlparse(civitai_url)
 
204
  path_parts = parsed_url.path.split('/')
205
  if len(path_parts) > 1:
 
206
  filename_part = path_parts[-1]
207
  if '.' in filename_part:
208
  temp_path = f"temp_civitai_{filename_part}"
@@ -211,6 +146,7 @@ def download_civitai_video(civitai_url):
211
  else:
212
  temp_path = f"temp_civitai_{int(time.time())}.webm"
213
  except:
 
214
  temp_path = f"temp_civitai_{int(time.time())}.webm"
215
 
216
  # Download the file
@@ -225,43 +161,25 @@ def download_civitai_video(civitai_url):
225
  except Exception as e:
226
  raise RuntimeError(f"Failed to download Civitai video: {e}")
227
 
228
- def download_kling_video(kling_url):
229
- """Direct download for Kling videos (no proxy needed)"""
230
  try:
231
- headers = {
232
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
233
- 'Referer': 'https://kling.ai/',
234
- 'Accept': 'video/mp4,video/*;q=0.9,*/*;q=0.8',
235
- }
236
-
237
- print(f"DEBUG: Downloading Kling video: {kling_url}")
238
-
239
- response = requests.get(kling_url, headers=headers, stream=True, timeout=30)
240
- response.raise_for_status()
241
-
242
- # Create filename - extract video ID from URL
243
- try:
244
- import re
245
- match = re.search(r'/([a-f0-9-]{36})_', kling_url)
246
- if match:
247
- video_id = match.group(1)[:12]
248
- temp_path = f"temp_kling_{video_id}.mp4"
249
- else:
250
- temp_path = f"temp_kling_{int(time.time())}.mp4"
251
- except:
252
- temp_path = f"temp_kling_{int(time.time())}.mp4"
253
-
254
- # Download the file
255
- with open(temp_path, "wb") as f:
256
- for chunk in response.iter_content(chunk_size=8192):
257
- if chunk:
258
- f.write(chunk)
259
 
260
- print(f"DEBUG: Kling video downloaded to: {temp_path}")
261
- return temp_path
 
262
 
 
 
 
 
 
 
 
263
  except Exception as e:
264
- raise RuntimeError(f"Failed to download Kling video: {e}")
265
 
266
  def download_midjourney_video(mj_url):
267
  """Download MidJourney videos via proxy"""
@@ -269,14 +187,17 @@ def download_midjourney_video(mj_url):
269
  proxy_base = "https://9cee417c-5874-4e53-939a-52ad3f6f2f30-00-16i6nbwyeqga.picard.replit.dev/"
270
  proxy_url = f"{proxy_base}?url={mj_url}"
271
 
 
272
  try:
273
  parsed_url = urlparse(mj_url)
274
  url_filename = os.path.basename(parsed_url.path)
275
  if url_filename and '.' in url_filename:
276
  temp_path = f"temp_mj_{url_filename}"
277
  else:
 
278
  temp_path = f"temp_mj_{int(time.time())}.mp4"
279
  except:
 
280
  temp_path = f"temp_mj_{int(time.time())}.mp4"
281
 
282
  print(f"DEBUG: Downloading MJ video via proxy: {proxy_url}")
@@ -302,6 +223,7 @@ def download_generic_video(url):
302
  response = requests.get(url, headers=headers, stream=True, timeout=30)
303
  response.raise_for_status()
304
 
 
305
  temp_path = f"temp_generic_{int(time.time())}.mp4"
306
 
307
  with open(temp_path, "wb") as f:
@@ -313,52 +235,11 @@ def download_generic_video(url):
313
  except Exception as e:
314
  raise RuntimeError(f"Failed to download generic video: {e}")
315
 
316
- def download_video_from_url(original_url):
317
- """Universal video downloader for MJ, Civitai, Kling, and others"""
318
- try:
319
- if not validate_url(original_url):
320
- raise ValueError("Invalid URL format")
321
-
322
- source = detect_video_source(original_url)
323
- print(f"DEBUG: Detected video source: {source}")
324
-
325
- if source == "civitai":
326
- return download_civitai_video(original_url)
327
- elif source == "kling":
328
- return download_kling_video(original_url)
329
- elif source == "midjourney":
330
- return download_midjourney_video(original_url)
331
- else:
332
- return download_generic_video(original_url)
333
-
334
- except Exception as e:
335
- raise RuntimeError(f"Failed to download video: {e}")
336
-
337
- # --- Testing functions ---
338
- def test_civitai_download(url):
339
- """Test function to check what format we get from Civitai"""
340
- try:
341
- print(f"🧪 Testing Civitai download: {url}")
342
-
343
- headers = {
344
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
345
- 'Referer': 'https://civitai.com/',
346
- 'Accept': 'video/webm,video/mp4,video/*;q=0.9,*/*;q=0.8',
347
- }
348
-
349
- response = requests.head(url, headers=headers, timeout=10)
350
-
351
- print(f"📋 Response Status: {response.status_code}")
352
- print(f"📋 Content-Type: {response.headers.get('content-type', 'Unknown')}")
353
- print(f"📋 Content-Length: {response.headers.get('content-length', 'Unknown')} bytes")
354
-
355
- if response.status_code == 200:
356
- return True, "✅ Civitai URL is accessible"
357
- else:
358
- return False, f"❌ Status: {response.status_code}"
359
-
360
- except Exception as e:
361
- return False, f"❌ Error: {str(e)}"
362
 
363
  # --- Main inference function ---
364
  def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
@@ -366,35 +247,43 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
366
  try:
367
  max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
368
 
 
369
  input_path = upload_video or video_url
370
  if not input_path:
371
  return None, None, "Error: No video source provided"
372
 
 
373
  base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
374
 
375
  print(f"DEBUG: Final filename locked in: '{base_name}'")
376
 
 
377
  output_dir = "./outputs"
378
  os.makedirs(output_dir, exist_ok=True)
379
 
 
380
  vis_video_path = os.path.join(output_dir, base_name + "_vis.mp4")
381
  rgbd_video_path = os.path.join(output_dir, base_name + "_RGBD.mp4")
382
 
383
  print(f"DEBUG: Output files - Vis: '{vis_video_path}', RGBD: '{rgbd_video_path}'")
384
 
 
385
  print("Reading video frames...")
386
  frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
387
  if len(frames) == 0:
388
  return None, None, "Error: No frames could be extracted from video"
389
 
 
390
  print("Generating depth maps...")
391
  depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
392
 
 
393
  save_video(depths, vis_video_path, fps=fps, is_depths=True)
394
 
395
  rgbd_path = None
396
  if stitch:
397
  print("Creating RGBD stitched video...")
 
398
  full_frames, _ = read_video_frames(input_path, max_len, target_fps, max_res=-1)
399
  d_min, d_max = depths.min(), depths.max()
400
  stitched_frames = []
@@ -403,6 +292,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
403
  rgb = full_frames[i]
404
  depth = ((depths[i] - d_min) / (d_max - d_min) * 255).astype(np.uint8)
405
 
 
406
  if grayscale:
407
  if convert_from_color:
408
  import matplotlib
@@ -417,16 +307,20 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
417
  cmap = matplotlib.colormaps.get_cmap("inferno")
418
  depth_vis = (cmap(depth / 255.0)[..., :3] * 255).astype(np.uint8)
419
 
 
420
  if blur > 0:
421
  kernel = int(blur * 20) * 2 + 1
422
  depth_vis = cv2.GaussianBlur(depth_vis, (kernel, kernel), 0)
423
 
 
424
  depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
425
  stitched = cv2.hconcat([rgb, depth_resized])
426
  stitched_frames.append(stitched)
427
 
 
428
  save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
429
 
 
430
  try:
431
  temp_audio_path = rgbd_video_path.replace('.mp4', '_audio.mp4')
432
  cmd = [
@@ -442,6 +336,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
442
  print(f"Audio processing failed: {e}")
443
  rgbd_path = rgbd_video_path
444
 
 
445
  gc.collect()
446
  if torch.cuda.is_available():
447
  torch.cuda.empty_cache()
@@ -466,16 +361,19 @@ def on_video_upload_change(video_file, use_blip):
466
  current_video_file = None
467
  blip_generated_name = ""
468
  original_filename = ""
469
- return "", gr.update(), "Upload a video file"
470
 
471
  try:
 
472
  current_video_file = video_file
473
- current_video_url = None
474
 
475
  print(f"DEBUG: Processing upload - video_file type: {type(video_file)}")
476
 
477
- original_filename = "uploaded_video"
 
478
 
 
479
  if hasattr(video_file, 'name') and video_file.name:
480
  print(f"DEBUG: video_file.name = '{video_file.name}'")
481
  original_name = os.path.splitext(os.path.basename(video_file.name))[0]
@@ -484,6 +382,7 @@ def on_video_upload_change(video_file, use_blip):
484
  original_filename = cleaned
485
  print(f"DEBUG: Method 1 success: '{original_filename}'")
486
 
 
487
  elif hasattr(video_file, 'orig_name') and video_file.orig_name:
488
  print(f"DEBUG: video_file.orig_name = '{video_file.orig_name}'")
489
  original_name = os.path.splitext(os.path.basename(video_file.orig_name))[0]
@@ -492,6 +391,7 @@ def on_video_upload_change(video_file, use_blip):
492
  original_filename = cleaned
493
  print(f"DEBUG: Method 2 success: '{original_filename}'")
494
 
 
495
  elif isinstance(video_file, str):
496
  print(f"DEBUG: video_file is string: '{video_file}'")
497
  original_name = os.path.splitext(os.path.basename(video_file))[0]
@@ -502,6 +402,7 @@ def on_video_upload_change(video_file, use_blip):
502
 
503
  print(f"DEBUG: Final original filename set to: '{original_filename}'")
504
 
 
505
  blip_generated_name = ""
506
  if use_blip:
507
  print("DEBUG: Starting optimized BLIP processing...")
@@ -509,10 +410,11 @@ def on_video_upload_change(video_file, use_blip):
509
  blip_generated_name = generate_blip_name(frame)
510
  print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
511
 
 
512
  final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
513
  print(f"DEBUG: Final name returned: '{final_name}' (BLIP: {use_blip})")
514
-
515
- return final_name, gr.update(), "Video uploaded successfully!"
516
 
517
  except Exception as e:
518
  error_msg = f"Upload processing failed: {str(e)}"
@@ -520,80 +422,55 @@ def on_video_upload_change(video_file, use_blip):
520
  return "uploaded_video", gr.update(), error_msg
521
 
522
  def on_video_url_change(url, use_blip):
523
- """Handle URL input change with support for MJ, Civitai, and Kling"""
524
  global current_video_file, current_video_url, blip_generated_name, original_filename
525
 
526
- print(f"DEBUG: URL handler called with URL: '{url}'")
527
-
528
  if not url or url.strip() == "":
529
- print("DEBUG: Empty URL - clearing state")
530
  current_video_file = None
531
  current_video_url = None
532
  blip_generated_name = ""
533
  original_filename = ""
534
- return gr.update(), "", "Enter a video URL (MidJourney, Civitai, or Kling supported)"
535
 
536
  try:
537
  source = detect_video_source(url)
538
- print(f"DEBUG: Processing URL for source: {source}")
539
 
540
- if source == "civitai":
541
- print("🔍 Civitai URL detected - running test...")
542
- test_success, test_message = test_civitai_download(url)
543
- print(test_message)
544
-
545
- if not test_success:
546
- return gr.update(), "", f"Civitai test failed: {test_message}"
547
 
548
- # Extract filename
549
  try:
550
- parsed_url = urlparse(url)
551
- url_path = parsed_url.path
552
-
553
  if source == "civitai":
554
- path_parts = url_path.split('/')
 
 
 
555
  for part in reversed(path_parts):
556
- if part and '.' in part:
557
- clean_name = os.path.splitext(part)[0]
558
- original_filename = "".join(c for c in clean_name if c.isalnum() or c in "_-")[:30]
559
- break
560
- elif part and len(part) > 2 and not part.startswith('transcode'):
561
- original_filename = "".join(c for c in part if c.isalnum() or c in "_-")[:30]
562
- break
563
  else:
564
  original_filename = "civitai_video"
565
 
566
- elif source == "kling":
567
- import re
568
- match = re.search(r'/([a-f0-9-]{36})_', url)
569
- if match:
570
- video_id = match.group(1)[:12]
571
- original_filename = f"kling_{video_id}"
572
- else:
573
- original_filename = "kling_video"
574
-
575
  else:
576
- url_filename = os.path.basename(url_path)
577
- if url_filename and '.' in url_filename:
578
- url_name = os.path.splitext(url_filename)[0]
579
- original_filename = "".join(c for c in url_name if c.isalnum() or c in "_-")[:30]
580
- if not original_filename:
581
- original_filename = "downloaded_video"
582
- else:
583
- original_filename = "downloaded_video"
584
  except:
585
  original_filename = f"{source}_video" if source != "unknown" else "downloaded_video"
586
 
587
- print(f"DEBUG: CLEAN original filename extracted: '{original_filename}' (source: {source})")
588
-
589
- print(f"Downloading {source} video from URL: {url}")
590
- video_path = download_video_from_url(url)
591
-
592
- current_video_file = None
593
- current_video_url = video_path
594
 
595
  blip_generated_name = ""
596
 
 
597
  if use_blip and video_path:
598
  try:
599
  print("DEBUG: Starting optimized BLIP processing for URL video...")
@@ -604,46 +481,41 @@ def on_video_url_change(url, use_blip):
604
  print(f"BLIP naming failed: {e}")
605
  blip_generated_name = ""
606
 
 
607
  final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
 
608
  print(f"DEBUG: {source.title()} final name returned: '{final_name}' (BLIP: {use_blip})")
609
-
610
- if source in ["civitai", "kling"]:
611
- if os.path.exists(video_path):
612
- file_size = os.path.getsize(video_path)
613
- print(f"📁 Downloaded file: {video_path} ({file_size} bytes)")
614
- success_msg = f"✅ {source.title()} video downloaded! File: {os.path.basename(video_path)}"
615
- else:
616
- success_msg = f"✅ {source.title()} video processed!"
617
- else:
618
- success_msg = f"✅ {source.title()} video downloaded successfully!"
619
-
620
  return video_path, final_name, success_msg
621
 
622
  except Exception as e:
623
  error_msg = f"Download failed: {str(e)}"
624
- print(f"DEBUG: URL handler error: {error_msg}")
625
- return gr.update(), "", error_msg
626
 
627
  def on_blip_toggle(use_blip):
628
  """Handle BLIP checkbox toggle - switch between BLIP and original name"""
629
  global current_video_file, current_video_url, blip_generated_name, original_filename
630
 
 
631
  if current_video_file is None and current_video_url is None:
632
  return "", "No video loaded"
633
 
634
  print(f"DEBUG: Toggle called - BLIP: {use_blip}, Original: '{original_filename}', BLIP name: '{blip_generated_name}'")
635
 
636
  try:
 
637
  if use_blip and not blip_generated_name:
638
  if current_video_file:
639
  frame = get_middle_frame_for_blip(current_video_file, target_size=480)
640
  blip_generated_name = generate_blip_name(frame)
641
  print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
642
  elif current_video_url:
 
643
  frame = get_middle_frame_for_blip(current_video_url, target_size=480)
644
  blip_generated_name = generate_blip_name(frame)
645
  print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
646
 
 
647
  if use_blip and blip_generated_name:
648
  final_name = blip_generated_name
649
  status = "Using BLIP generated name"
@@ -651,4 +523,173 @@ def on_blip_toggle(use_blip):
651
  final_name = original_filename if original_filename else "video"
652
  status = "Using original filename"
653
 
654
- print(f"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  import gradio as gr
8
  import subprocess
9
  import requests
 
10
  from urllib.parse import urlparse
11
  from huggingface_hub import hf_hub_download
12
  from video_depth_anything.video_depth import VideoDepthAnything
 
45
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
46
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def generate_blip_name(frame: np.ndarray) -> str:
49
  """Generate filename from frame using BLIP image captioning"""
50
  try:
 
60
  # Remove common stopwords and create filename
61
  stopwords = {"a", "an", "the", "in", "on", "at", "with", "by", "of", "for", "under", "through", "and", "is"}
62
  words = [w for w in caption.split() if w not in stopwords and w.isalpha()]
 
 
 
 
63
  trimmed = "_".join(words[:3])
64
  return trimmed[:30] if trimmed else "video"
65
  except Exception as e:
66
  print(f"BLIP error: {e}")
67
  return "video"
68
 
69
+ # --- Load depth model ---
70
+ print("Loading Video Depth Anything model...")
71
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
72
+ encoder = 'vitl'
73
+ model_name = 'Large'
74
+ model_configs = {
75
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
76
+ }
77
+ video_depth_anything = VideoDepthAnything(**model_configs[encoder])
78
+ ckpt_path = hf_hub_download(repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
79
+ filename=f"video_depth_anything_{encoder}.pth",
80
+ cache_dir="/tmp/huggingface")
81
+ video_depth_anything.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
82
+ video_depth_anything = video_depth_anything.to(DEVICE).eval()
83
+
84
+ # --- URL validation and download ---
85
  def validate_url(url):
86
  """Validate if URL is properly formatted"""
87
  try:
 
101
  else:
102
  return "unknown"
103
 
 
104
  def optimize_civitai_url(url):
105
  """Convert gallery Civitai URLs to original quality to avoid dimension issues"""
106
  if "image.civitai.com" in url and "width=450" in url:
 
118
  # Optimize URL to avoid dimension issues
119
  civitai_url = optimize_civitai_url(civitai_url)
120
 
121
+ # Civitai videos can often be downloaded directly
122
  headers = {
123
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
124
  'Referer': 'https://civitai.com/',
125
  'Accept': 'video/webm,video/mp4,video/*;q=0.9,*/*;q=0.8',
126
  }
127
 
128
+ # Try direct download first
129
  print(f"DEBUG: Downloading optimized Civitai video: {civitai_url}")
130
 
131
  response = requests.get(civitai_url, headers=headers, stream=True, timeout=30)
 
134
  # Create filename based on URL
135
  try:
136
  parsed_url = urlparse(civitai_url)
137
+ # Extract filename from URL path
138
  path_parts = parsed_url.path.split('/')
139
  if len(path_parts) > 1:
140
+ # Get the last part that might be a filename
141
  filename_part = path_parts[-1]
142
  if '.' in filename_part:
143
  temp_path = f"temp_civitai_{filename_part}"
 
146
  else:
147
  temp_path = f"temp_civitai_{int(time.time())}.webm"
148
  except:
149
+ import time
150
  temp_path = f"temp_civitai_{int(time.time())}.webm"
151
 
152
  # Download the file
 
161
  except Exception as e:
162
  raise RuntimeError(f"Failed to download Civitai video: {e}")
163
 
164
+ def download_video_from_url(original_url):
165
+ """Universal video downloader for MJ, Civitai, and others"""
166
  try:
167
+ if not validate_url(original_url):
168
+ raise ValueError("Invalid URL format")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
+ # Detect source and use appropriate method
171
+ source = detect_video_source(original_url)
172
+ print(f"DEBUG: Detected video source: {source}")
173
 
174
+ if source == "civitai":
175
+ return download_civitai_video(original_url)
176
+ elif source == "midjourney":
177
+ return download_midjourney_video(original_url)
178
+ else:
179
+ return download_generic_video(original_url)
180
+
181
  except Exception as e:
182
+ raise RuntimeError(f"Failed to download video: {e}")
183
 
184
  def download_midjourney_video(mj_url):
185
  """Download MidJourney videos via proxy"""
 
187
  proxy_base = "https://9cee417c-5874-4e53-939a-52ad3f6f2f30-00-16i6nbwyeqga.picard.replit.dev/"
188
  proxy_url = f"{proxy_base}?url={mj_url}"
189
 
190
+ # Create filename
191
  try:
192
  parsed_url = urlparse(mj_url)
193
  url_filename = os.path.basename(parsed_url.path)
194
  if url_filename and '.' in url_filename:
195
  temp_path = f"temp_mj_{url_filename}"
196
  else:
197
+ import time
198
  temp_path = f"temp_mj_{int(time.time())}.mp4"
199
  except:
200
+ import time
201
  temp_path = f"temp_mj_{int(time.time())}.mp4"
202
 
203
  print(f"DEBUG: Downloading MJ video via proxy: {proxy_url}")
 
223
  response = requests.get(url, headers=headers, stream=True, timeout=30)
224
  response.raise_for_status()
225
 
226
+ import time
227
  temp_path = f"temp_generic_{int(time.time())}.mp4"
228
 
229
  with open(temp_path, "wb") as f:
 
235
  except Exception as e:
236
  raise RuntimeError(f"Failed to download generic video: {e}")
237
 
238
+ # --- Global variables for toggling ---
239
+ current_video_file = None
240
+ current_video_url = None
241
+ blip_generated_name = ""
242
+ original_filename = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
  # --- Main inference function ---
245
  def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
 
247
  try:
248
  max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
249
 
250
+ # Determine input source
251
  input_path = upload_video or video_url
252
  if not input_path:
253
  return None, None, "Error: No video source provided"
254
 
255
+ # Fix filename at generation time (no more changing after this point)
256
  base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
257
 
258
  print(f"DEBUG: Final filename locked in: '{base_name}'")
259
 
260
+ # Create output directory
261
  output_dir = "./outputs"
262
  os.makedirs(output_dir, exist_ok=True)
263
 
264
+ # Use final names (not temp names!)
265
  vis_video_path = os.path.join(output_dir, base_name + "_vis.mp4")
266
  rgbd_video_path = os.path.join(output_dir, base_name + "_RGBD.mp4")
267
 
268
  print(f"DEBUG: Output files - Vis: '{vis_video_path}', RGBD: '{rgbd_video_path}'")
269
 
270
+ # Process video frames
271
  print("Reading video frames...")
272
  frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
273
  if len(frames) == 0:
274
  return None, None, "Error: No frames could be extracted from video"
275
 
276
+ # Generate depth maps
277
  print("Generating depth maps...")
278
  depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
279
 
280
+ # Save depth visualization with final name
281
  save_video(depths, vis_video_path, fps=fps, is_depths=True)
282
 
283
  rgbd_path = None
284
  if stitch:
285
  print("Creating RGBD stitched video...")
286
+ # Read full resolution frames for stitching
287
  full_frames, _ = read_video_frames(input_path, max_len, target_fps, max_res=-1)
288
  d_min, d_max = depths.min(), depths.max()
289
  stitched_frames = []
 
292
  rgb = full_frames[i]
293
  depth = ((depths[i] - d_min) / (d_max - d_min) * 255).astype(np.uint8)
294
 
295
+ # Apply depth visualization options
296
  if grayscale:
297
  if convert_from_color:
298
  import matplotlib
 
307
  cmap = matplotlib.colormaps.get_cmap("inferno")
308
  depth_vis = (cmap(depth / 255.0)[..., :3] * 255).astype(np.uint8)
309
 
310
+ # Apply blur if requested
311
  if blur > 0:
312
  kernel = int(blur * 20) * 2 + 1
313
  depth_vis = cv2.GaussianBlur(depth_vis, (kernel, kernel), 0)
314
 
315
+ # Resize depth to match RGB and stitch side by side
316
  depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
317
  stitched = cv2.hconcat([rgb, depth_resized])
318
  stitched_frames.append(stitched)
319
 
320
+ # Save stitched video with final name
321
  save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
322
 
323
+ # Add audio from original video if possible
324
  try:
325
  temp_audio_path = rgbd_video_path.replace('.mp4', '_audio.mp4')
326
  cmd = [
 
336
  print(f"Audio processing failed: {e}")
337
  rgbd_path = rgbd_video_path
338
 
339
+ # Clean up memory
340
  gc.collect()
341
  if torch.cuda.is_available():
342
  torch.cuda.empty_cache()
 
361
  current_video_file = None
362
  blip_generated_name = ""
363
  original_filename = ""
364
+ return "", gr.update(), "Upload a video file" # Don't change URL when clearing
365
 
366
  try:
367
+ # Store the current video
368
  current_video_file = video_file
369
+ current_video_url = None # Clear URL when uploading file
370
 
371
  print(f"DEBUG: Processing upload - video_file type: {type(video_file)}")
372
 
373
+ # Generate original filename FIRST - try multiple ways
374
+ original_filename = "uploaded_video" # Default fallback
375
 
376
+ # Method 1: Check .name attribute
377
  if hasattr(video_file, 'name') and video_file.name:
378
  print(f"DEBUG: video_file.name = '{video_file.name}'")
379
  original_name = os.path.splitext(os.path.basename(video_file.name))[0]
 
382
  original_filename = cleaned
383
  print(f"DEBUG: Method 1 success: '{original_filename}'")
384
 
385
+ # Method 2: Check .orig_name attribute (Gradio sometimes uses this)
386
  elif hasattr(video_file, 'orig_name') and video_file.orig_name:
387
  print(f"DEBUG: video_file.orig_name = '{video_file.orig_name}'")
388
  original_name = os.path.splitext(os.path.basename(video_file.orig_name))[0]
 
391
  original_filename = cleaned
392
  print(f"DEBUG: Method 2 success: '{original_filename}'")
393
 
394
+ # Method 3: Try to get filename from the file path itself
395
  elif isinstance(video_file, str):
396
  print(f"DEBUG: video_file is string: '{video_file}'")
397
  original_name = os.path.splitext(os.path.basename(video_file))[0]
 
402
 
403
  print(f"DEBUG: Final original filename set to: '{original_filename}'")
404
 
405
+ # Generate BLIP name
406
  blip_generated_name = ""
407
  if use_blip:
408
  print("DEBUG: Starting optimized BLIP processing...")
 
410
  blip_generated_name = generate_blip_name(frame)
411
  print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
412
 
413
+ # Return appropriate name based on BLIP setting
414
  final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
415
  print(f"DEBUG: Final name returned: '{final_name}' (BLIP: {use_blip})")
416
+ print(f"DEBUG: Returning - filename: '{final_name}', clear URL: '', status: 'success'")
417
+ return final_name, "", "Video uploaded successfully!" # Clear URL when video uploaded
418
 
419
  except Exception as e:
420
  error_msg = f"Upload processing failed: {str(e)}"
 
422
  return "uploaded_video", gr.update(), error_msg
423
 
424
  def on_video_url_change(url, use_blip):
425
+ """Handle URL input change with support for MJ and Civitai"""
426
  global current_video_file, current_video_url, blip_generated_name, original_filename
427
 
 
 
428
  if not url or url.strip() == "":
 
429
  current_video_file = None
430
  current_video_url = None
431
  blip_generated_name = ""
432
  original_filename = ""
433
+ return None, "", "Enter a video URL (MidJourney or Civitai supported)"
434
 
435
  try:
436
  source = detect_video_source(url)
437
+ print(f"Downloading {source} video from URL: {url}")
438
 
439
+ video_path = download_video_from_url(url)
440
+
441
+ # Store the current video info
442
+ current_video_file = None # Clear file when using URL
443
+ current_video_url = video_path
 
 
444
 
445
+ # Set original filename based on source
446
  try:
 
 
 
447
  if source == "civitai":
448
+ # Extract filename from Civitai URL
449
+ parsed_url = urlparse(url)
450
+ path_parts = parsed_url.path.split('/')
451
+ # Look for meaningful filename in path
452
  for part in reversed(path_parts):
453
+ if part and '.' not in part and len(part) > 3:
454
+ cleaned = "".join(c for c in part if c.isalnum() or c in "_-")[:20]
455
+ if cleaned:
456
+ original_filename = f"civitai_{cleaned}"
457
+ break
 
 
458
  else:
459
  original_filename = "civitai_video"
460
 
461
+ elif source == "midjourney":
462
+ original_filename = "midjourney_video"
 
 
 
 
 
 
 
463
  else:
464
+ original_filename = "downloaded_video"
465
+
 
 
 
 
 
 
466
  except:
467
  original_filename = f"{source}_video" if source != "unknown" else "downloaded_video"
468
 
469
+ print(f"DEBUG: {source.title()} original filename set to: '{original_filename}'")
 
 
 
 
 
 
470
 
471
  blip_generated_name = ""
472
 
473
+ # Generate BLIP name if requested
474
  if use_blip and video_path:
475
  try:
476
  print("DEBUG: Starting optimized BLIP processing for URL video...")
 
481
  print(f"BLIP naming failed: {e}")
482
  blip_generated_name = ""
483
 
484
+ # Return appropriate name
485
  final_name = blip_generated_name if (use_blip and blip_generated_name) else original_filename
486
+ success_msg = f"✅ {source.title()} video downloaded successfully!"
487
  print(f"DEBUG: {source.title()} final name returned: '{final_name}' (BLIP: {use_blip})")
 
 
 
 
 
 
 
 
 
 
 
488
  return video_path, final_name, success_msg
489
 
490
  except Exception as e:
491
  error_msg = f"Download failed: {str(e)}"
492
+ print(error_msg)
493
+ return None, "", error_msg
494
 
495
  def on_blip_toggle(use_blip):
496
  """Handle BLIP checkbox toggle - switch between BLIP and original name"""
497
  global current_video_file, current_video_url, blip_generated_name, original_filename
498
 
499
+ # Only react if we have a video loaded
500
  if current_video_file is None and current_video_url is None:
501
  return "", "No video loaded"
502
 
503
  print(f"DEBUG: Toggle called - BLIP: {use_blip}, Original: '{original_filename}', BLIP name: '{blip_generated_name}'")
504
 
505
  try:
506
+ # If toggling BLIP on and we don't have a BLIP name yet, generate it
507
  if use_blip and not blip_generated_name:
508
  if current_video_file:
509
  frame = get_middle_frame_for_blip(current_video_file, target_size=480)
510
  blip_generated_name = generate_blip_name(frame)
511
  print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
512
  elif current_video_url:
513
+ # For URL videos, we might need to re-read frames
514
  frame = get_middle_frame_for_blip(current_video_url, target_size=480)
515
  blip_generated_name = generate_blip_name(frame)
516
  print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
517
 
518
+ # Return appropriate name based on toggle
519
  if use_blip and blip_generated_name:
520
  final_name = blip_generated_name
521
  status = "Using BLIP generated name"
 
523
  final_name = original_filename if original_filename else "video"
524
  status = "Using original filename"
525
 
526
+ print(f"DEBUG: Toggle returning: '{final_name}' - {status}")
527
+ return final_name, status
528
+
529
+ except Exception as e:
530
+ error_msg = f"Name generation failed: {str(e)}"
531
+ print(error_msg)
532
+ fallback = original_filename if original_filename else "video"
533
+ return fallback, error_msg
534
+
535
+ # --- Gradio Interface ---
536
+ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
537
+ gr.Markdown("""
538
+ # 🎥 Video Depth Anything + RGBD Output
539
+
540
+ Generate depth maps from videos and watch RGBD videos on holographic displays like Looking Glass Go.
541
+ Upload a video or paste a video URL (Midjourney, Civitai, or Kling).
542
+
543
+ [🔗 Project Page](https://videodepthanything.github.io/) | [📖 Paper](https://arxiv.org/abs/2401.01884)
544
+ """)
545
+
546
+ # Status display
547
+ status_display = gr.HTML("")
548
+
549
+ with gr.Row(equal_height=True):
550
+ with gr.Column(scale=1):
551
+ upload_video = gr.Video(
552
+ label="πŸ“ Upload Video",
553
+ height=500,
554
+ show_label=True
555
+ )
556
+ with gr.Column(scale=1):
557
+ depth_out = gr.Video(
558
+ label="🎨 Depth Visualization",
559
+ interactive=False,
560
+ autoplay=True,
561
+ height=500,
562
+ show_label=True
563
+ )
564
+ with gr.Column(scale=2):
565
+ rgbd_out = gr.Video(
566
+ label="🔄 RGBD Side-by-Side",
567
+ interactive=False,
568
+ autoplay=True,
569
+ height=500,
570
+ show_label=True
571
+ )
572
+
573
+ with gr.Row():
574
+ video_url = gr.Textbox(
575
+ label="🔗 Video URL (MJ, Civitai, or Kling)",
576
+ placeholder="Paste MidJourney, Civitai, or Kling video URL here...",
577
+ scale=4
578
+ )
579
+ use_blip = gr.Checkbox(
580
+ label="🤖 Auto-name with BLIP",
581
+ value=True,
582
+ scale=2,
583
+ info="Generate filename from video content"
584
+ )
585
+ filename = gr.Textbox(
586
+ label="πŸ“ Output Filename (_RGBD.mp4 will be added)",
587
+ placeholder="Enter filename or let BLIP generate it",
588
+ scale=4
589
+ )
590
+
591
+ # Event handlers for input changes - FIXED to prevent interference
592
+ video_url.change(
593
+ fn=on_video_url_change,
594
+ inputs=[video_url, use_blip],
595
+ outputs=[upload_video, filename, status_display], # URL loads video to upload field
596
+ queue=False # Don't queue URL changes
597
+ )
598
+
599
+ upload_video.upload( # Use .upload instead of .change
600
+ fn=on_video_upload_change,
601
+ inputs=[upload_video, use_blip],
602
+ outputs=[filename, video_url, status_display], # Upload clears URL field
603
+ queue=False # Don't queue uploads
604
+ )
605
+
606
+ # Toggle BLIP checkbox to switch between names
607
+ use_blip.change(
608
+ fn=on_blip_toggle,
609
+ inputs=[use_blip],
610
+ outputs=[filename, status_display]
611
+ )
612
+
613
+ with gr.Accordion("⚙️ Advanced Settings", open=False):
614
+ with gr.Row():
615
+ max_len = gr.Slider(
616
+ label="Max Frames",
617
+ minimum=-1,
618
+ maximum=1000,
619
+ value=-1,
620
+ step=1,
621
+ info="Maximum frames to process (-1 for all)"
622
+ )
623
+ target_fps = gr.Slider(
624
+ label="Target FPS",
625
+ minimum=-1,
626
+ maximum=30,
627
+ value=-1,
628
+ step=1,
629
+ info="Output FPS (-1 for original)"
630
+ )
631
+ max_res = gr.Slider(
632
+ label="Max Resolution",
633
+ minimum=480,
634
+ maximum=1920,
635
+ value=1280,
636
+ step=1,
637
+ info="Maximum resolution for processing"
638
+ )
639
+
640
+ with gr.Row():
641
+ stitch = gr.Checkbox(
642
+ label="Create RGBD Output",
643
+ value=True,
644
+ info="Generate side-by-side RGB + Depth video"
645
+ )
646
+ grayscale = gr.Checkbox(
647
+ label="Grayscale Depth",
648
+ value=True,
649
+ info="Convert depth to grayscale"
650
+ )
651
+ convert_from_color = gr.Checkbox(
652
+ label="From Colormap",
653
+ value=True,
654
+ info="Convert from color before grayscale"
655
+ )
656
+ blur = gr.Slider(
657
+ label="Depth Blur",
658
+ minimum=0,
659
+ maximum=1,
660
+ value=0.3,
661
+ step=0.01,
662
+ info="Blur amount for depth visualization"
663
+ )
664
+
665
+ run_btn = gr.Button("🚀 Generate Depth Video", variant="primary", size="lg")
666
+
667
+ # Main processing event
668
+ run_btn.click(
669
+ fn=infer_video_depth_from_source,
670
+ inputs=[
671
+ upload_video, video_url, filename, use_blip,
672
+ max_len, target_fps, max_res, stitch,
673
+ grayscale, convert_from_color, blur
674
+ ],
675
+ outputs=[depth_out, rgbd_out, status_display]
676
+ )
677
+
678
+ gr.Markdown("""
679
+ ### 💡 Tips:
680
+ - **Upload formats**: MP4, AVI, MOV, etc.
681
+ - **BLIP naming**: Automatically generates descriptive filenames
682
+ - **RGBD output**: Side-by-side comparison of original and depth
683
+ - **Processing time**: Depends on video length and resolution
684
+ - **Filename**: Set your preferred name before clicking Generate!
685
+ """)
686
+
687
+ demo.queue(max_size=10)
688
+
689
+ if __name__ == "__main__":
690
+ print("Starting Video Depth Anything interface...")
691
+ demo.launch(
692
+ server_name="0.0.0.0",
693
+ server_port=7860,
694
+ show_error=True
695
+ )