Spaces: Running on Zero

John Ho committed
Commit 4361fd1 · 1 Parent(s): ba43302

updated app to load multiple models

Files changed:
- README.md +3 -3
- app.py +39 -13
- requirements.txt +2 -2
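In short (see the app.py diff below): instead of loading one model inside `inference`, the app now preloads several Qwen2.5-VL checkpoints into module-level `MODEL_ZOO` / `PROCESSORS` dictionaries and lets the user choose one from a Gradio dropdown. A minimal sketch of that pattern, with the actual model loading stubbed out (the real app uses the `load_model` / `load_processor` helpers shown in the diff):

```python
import gradio as gr

# Stand-ins for the app's load_model()/load_processor() helpers.
def load_model(name):
    return f"<model:{name}>"        # placeholder; the real app returns a transformers model

def load_processor(name):
    return f"<processor:{name}>"    # placeholder; the real app returns an AutoProcessor

# Preload everything once at import time, keyed by a short display name.
MODEL_ZOO = {
    "qwen2.5-vl-7b-instruct": load_model("Qwen/Qwen2.5-VL-7B-Instruct"),
    "qwen2.5-vl-3b-instruct": load_model("Qwen/Qwen2.5-VL-3B-Instruct"),
}
PROCESSORS = {
    "qwen2.5-vl-7b-instruct": load_processor("Qwen/Qwen2.5-VL-7B-Instruct"),
    "qwen2.5-vl-3b-instruct": load_processor("Qwen/Qwen2.5-VL-3B-Instruct"),
}

def inference(video_path, prompt, model_name="qwen2.5-vl-7b-instruct"):
    # Look up the preloaded objects instead of reloading them per request.
    model, processor = MODEL_ZOO[model_name], PROCESSORS[model_name]
    return {"model": model_name, "prompt": prompt, "video": video_path}

demo = gr.Interface(
    fn=inference,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Textbox(label="Prompt"),
        gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
    ],
    outputs=gr.JSON(label="Output JSON"),
)
```

Preloading at import time means the selected checkpoint is already resident when the GPU-decorated `inference` function runs, at the cost of keeping every model in memory at once.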
README.md CHANGED

@@ -1,13 +1,13 @@
 ---
-title:
+title: Video Captioning
 emoji: 📸
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.32.0
-app_file: app.py
+app_file: app.py
 pinned: false
-short_description:
+short_description: Using VLMs for video captioning
 ---

 # The HuggingFace Space Template
app.py CHANGED

@@ -1,8 +1,8 @@
-from statistics import quantiles
 import spaces, ffmpeg, os, sys, torch
 import gradio as gr
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    AutoModelForImageTextToText,
     AutoProcessor,
     BitsAndBytesConfig,
 )
@@ -85,8 +85,7 @@ def load_model(
         )
     )
     # Set model to evaluation mode for inference (disables dropout, etc.)
-    model.eval()
-    return model
+    return model.eval()


 def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
@@ -98,23 +97,49 @@ def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
     )


-
-
+logger.debug("Loading Models and Processors...")
+MODEL_ZOO = {
+    "qwen2.5-vl-7b-cam-motion-preview": load_model(
+        model_name="chancharikm/qwen2.5-vl-7b-cam-motion-preview",
+        use_flash_attention=False,
+        apply_quantization=False,
+    ),
+    "qwen2.5-vl-7b-instruct": load_model(
+        model_name="Qwen/Qwen2.5-VL-7B-Instruct",
+        use_flash_attention=False,
+        apply_quantization=False,
+    ),
+    "qwen2.5-vl-3b-instruct": load_model(
+        model_name="Qwen/Qwen2.5-VL-3B-Instruct",
+        use_flash_attention=False,
+        apply_quantization=False,
+    ),
+}
+
+PROCESSORS = {
+    "qwen2.5-vl-7b-cam-motion-preview": load_processor("Qwen/Qwen2.5-VL-7B-Instruct"),
+    "qwen2.5-vl-7b-instruct": load_processor("Qwen/Qwen2.5-VL-7B-Instruct"),
+    "qwen2.5-vl-3b-instruct": load_processor("Qwen/Qwen2.5-VL-3B-Instruct"),
+}
+logger.debug("Models and Processors Loaded!")


 @spaces.GPU(duration=120)
 def inference(
     video_path: str,
     prompt: str = "Describe the camera motion in this video.",
-    use_flash_attention: bool = True,
-    apply_quantization: bool = True,
+    model_name: str = "qwen2.5-vl-7b-instruct",
+    # use_flash_attention: bool = True,
+    # apply_quantization: bool = True,
 ):
     # default processor
     # processor, model = PROCESSOR, MODEL
-    processor = load_processor()
-    model = load_model(
-        use_flash_attention=use_flash_attention, apply_quantization=apply_quantization
-    )
+    # processor = load_processor()
+    # model = load_model(
+    #     use_flash_attention=use_flash_attention, apply_quantization=apply_quantization
+    # )
+    model = MODEL_ZOO[model_name]
+    processor = PROCESSORS[model_name]

     # The model is trained on 8.0 FPS which we recommend for optimal inference
     fps = get_fps_ffmpeg(video_path)
@@ -173,8 +198,9 @@ demo = gr.Interface(
     inputs=[
         gr.Video(label="Input Video"),
         gr.Textbox(label="Prompt", value="Describe the camera motion in this video."),
-        gr.Checkbox(label="Use Flash Attention", value=False),
-        gr.Checkbox(label="Apply Quantization", value=True),
+        gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
+        # gr.Checkbox(label="Use Flash Attention", value=False),
+        # gr.Checkbox(label="Apply Quantization", value=True),
     ],
     outputs=gr.JSON(label="Output JSON"),
     title="",
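The `inference` hunk above calls a `get_fps_ffmpeg` helper that is not part of this diff. Since requirements.txt pulls in ffmpeg-python, a plausible implementation (an assumption, not code taken from this Space) probes the first video stream and parses its reported frame rate:

```python
import ffmpeg  # ffmpeg-python

def get_fps_ffmpeg(video_path: str) -> float:
    """Return the video's frame rate (assumed helper, not the Space's actual code)."""
    probe = ffmpeg.probe(video_path)
    # Pick the first video stream reported by ffprobe.
    video_stream = next(s for s in probe["streams"] if s["codec_type"] == "video")
    # r_frame_rate comes back as a fraction string such as "30000/1001".
    num, den = video_stream["r_frame_rate"].split("/")
    return float(num) / float(den)
```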
requirements.txt CHANGED

@@ -1,10 +1,10 @@
 torch
 torchvision
 #transformers==4.51.3
-transformers
+transformers
 accelerate
 qwen-vl-utils
 ffmpeg-python
 loguru
 bitsandbytes
-scipy
+scipy
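requirements.txt keeps bitsandbytes and accelerate, which matches the `use_flash_attention` / `apply_quantization` flags passed to `load_model` in the app.py diff. Only the tail of `load_model` appears in the hunks above; a sketch of how such a loader is commonly written with transformers (an assumption about the unshown parts, not this Space's exact code):

```python
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig

def load_model(
    model_name="Qwen/Qwen2.5-VL-7B-Instruct",
    use_flash_attention=False,
    apply_quantization=False,
):
    kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
    if use_flash_attention:
        # Requires the flash-attn package and a supported GPU.
        kwargs["attn_implementation"] = "flash_attention_2"
    if apply_quantization:
        # 4-bit NF4 quantization via bitsandbytes.
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name, **kwargs)
    # Set model to evaluation mode for inference (disables dropout, etc.)
    return model.eval()
```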