Update app.py
app.py
CHANGED
@@ -17,7 +17,10 @@ zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
 pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")

 standard_sys = f"""
-
+You will be provided with a list of visual events and an audio description. All of this information comes from a single video.
+The visual events are images extracted from this video every 12 frames.
+The audio description comes from the audio track of the video.
+Your job is to use this information to give a short summary of what is happening in the video.
 """

 def extract_frames(video_in, interval=24, output_format='.jpg'):
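The new system prompt is consumed when the formatted captions are sent to the zephyr pipeline via llm_process (see the last hunk). The body of llm_process is not part of this diff, so the following is only a minimal sketch of how standard_sys might be combined with pipe, assuming a standard chat-template call; the message structure and decoding parameters are illustrative, not taken from the repository.

def llm_process(formatted_captions):
    # Chat-style conversation: standard_sys as the system message,
    # the combined frame + audio captions as the user message.
    messages = [
        {"role": "system", "content": standard_sys},
        {"role": "user", "content": formatted_captions},
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.95)
    # Strip the prompt prefix so only the generated summary is returned.
    return outputs[0]["generated_text"][len(prompt):].strip()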
@@ -83,7 +86,7 @@ def extract_audio(video_path):
     return "output_audio.mp3"

 def get_salmonn(audio_in):
-    salmonn_prompt = "
+    salmonn_prompt = "Please describe the audio"
     client = Client("fffiloni/SALMONN-7B-gradio")
     result = client.predict(
         audio_in,    # filepath in 'Audio' Audio component
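The hunk above is cut off in the middle of the client.predict(...) call. For context, here is a sketch of what the full get_salmonn could look like with gradio_client; the argument order and api_name are defined by the fffiloni/SALMONN-7B-gradio Space and are assumptions here, not part of this change.

from gradio_client import Client

def get_salmonn(audio_in):
    salmonn_prompt = "Please describe the audio"
    client = Client("fffiloni/SALMONN-7B-gradio")
    result = client.predict(
        audio_in,        # filepath in 'Audio' Audio component
        salmonn_prompt,  # text instruction sent with the audio (assumed parameter)
        api_name="/predict",  # assumed endpoint name
    )
    # result is expected to be SALMONN's textual description of the audio.
    return result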
@@ -141,9 +144,9 @@ def infer(video_in):
     print(formatted_captions)

     # Send formatted captions to LLM
-
+    video_description_from_llm = llm_process(formatted_captions)

-    return
+    return video_description_from_llm

 with gr.Blocks() as demo :
     with gr.Column(elem_id="col-container"):
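With this change, infer returns the LLM summary instead of a bare return, so it can feed a UI component directly. The Blocks layout continues past the lines shown; below is a minimal sketch of how infer might be wired up, with component names chosen here for illustration.

with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        video_in = gr.Video(label="Video input")
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")
    # Run the full pipeline (frames -> captions -> audio description -> LLM summary)
    # and show the returned summary in the textbox.
    submit_btn.click(fn=infer, inputs=[video_in], outputs=[video_description])

demo.launch()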