# Source: Hugging Face Space file "app.py" (author: littlebird13,
# commit dd2e2b8, verified; ~4.22 kB). Web-viewer chrome removed so the
# file parses as Python.
import os
import dashscope
from argparse import ArgumentParser
import gradio as gr

# NOTE(review): these vLLM settings look like leftovers from a local-inference
# variant — this app only calls the hosted DashScope API below. Confirm before
# removing.
os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
# DashScope credentials are read from the environment; this is None if unset,
# in which case the API call in _launch_demo will fail at request time.
dashscope.api_key = os.environ.get("DASHSCOPE_API_KEY")
def _launch_demo(args):
    """Build and launch the Gradio UI for the Qwen3-Omni audio captioner.

    Args:
        args: Parsed CLI namespace; only ``share``, ``inbrowser``,
            ``server_port`` and ``server_name`` are used here (the
            checkpoint/vLLM flags are accepted for CLI compatibility but
            inference goes through the DashScope API).
    """

    def generate_caption_from_audio(audio_path, temperature, top_p, top_k):
        """Call the DashScope captioner model on a local audio file.

        Returns:
            The generated caption text.

        Raises:
            RuntimeError: if the API response does not contain a caption
                (bad/missing API key, quota exceeded, unreadable audio, ...),
                carrying the raw response for diagnosis.
        """
        messages = [
            {
                "role": "user",
                # DashScope expects a file:// URI for local audio files.
                "content": [{"audio": "file://" + audio_path}],
            }
        ]
        response = dashscope.MultiModalConversation.call(
            model="qwen3-omni-30b-a3b-captioner",
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            messages=messages)
        try:
            return response["output"]["choices"][0]["message"].content[0]["text"]
        except (KeyError, IndexError, TypeError) as err:
            # Surface API-side failures to the caller instead of an opaque
            # KeyError traceback; the raw response contains the error code.
            raise RuntimeError(f"DashScope call failed: {response}") from err

    def on_submit(audio_path, temperature, top_p, top_k):
        """Streaming submit handler.

        Yields (caption, submit-button update) pairs. The button is disabled
        while the (potentially slow) API call runs and re-enabled on every
        terminal path — the original code only ever re-enabled it.
        """
        if not audio_path:
            yield None, gr.update(interactive=True)
            return
        # Lock the button for the duration of the API call.
        yield None, gr.update(interactive=False)
        try:
            caption = generate_caption_from_audio(
                audio_path, temperature, top_p, top_k)
        except RuntimeError as err:
            # Show the failure in the caption box and unlock the button.
            yield str(err), gr.update(interactive=True)
            return
        yield caption, gr.update(interactive=True)

    with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"])) as demo:
        gr.Markdown("# Qwen3-Omni-30B-A3B-Captioner Demo")
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(sources=['upload', 'microphone'], type="filepath", label="Upload or record an audio")
                with gr.Accordion("Generation Parameters", open=True):
                    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6, step=0.1)
                    top_p = gr.Slider(label="Top P", minimum=0.05, maximum=1.0, value=0.95, step=0.05)
                    top_k = gr.Slider(label="Top K", minimum=1, maximum=100, value=20, step=1)
                with gr.Row():
                    submit_btn = gr.Button("Submit", variant="primary")
                    clear_btn = gr.Button("Clear")
            with gr.Column(scale=2):
                output_caption = gr.Textbox(
                    label="Caption Result",
                    lines=15,
                    interactive=False
                )

        def clear_fields():
            # Reset both the audio input and the caption box.
            return None, ""

        submit_btn.click(
            fn=on_submit,
            inputs=[audio_input, temperature, top_p, top_k],
            outputs=[output_caption, submit_btn]
        )
        clear_btn.click(fn=clear_fields, inputs=None, outputs=[audio_input, output_caption])

    demo.queue(100, max_size=100).launch(max_threads=100,
                                         ssr_mode=False,
                                         share=args.share,
                                         inbrowser=args.inbrowser,
                                         server_port=args.server_port,
                                         server_name=args.server_name,)
DEFAULT_CKPT_PATH = "Qwen/Qwen3-Omni-30B-A3B-Captioner"
def _get_args():
parser = ArgumentParser()
parser.add_argument('-c', '--checkpoint-path', type=str, default=DEFAULT_CKPT_PATH,
help='Checkpoint name or path, default to %(default)r')
parser.add_argument('--flash-attn2', action='store_true', default=False,
help='Enable flash_attention_2 when loading the model.')
parser.add_argument('--use-transformers', action='store_true', default=False,
help='Use transformers for inference instead of vLLM.')
parser.add_argument('--share', action='store_true', default=False,
help='Create a publicly shareable link for the interface.')
parser.add_argument('--inbrowser', action='store_true', default=False,
help='Automatically launch the interface in a new tab on the default browser.')
parser.add_argument('--server-port', type=int, help='Demo server port.')
parser.add_argument('--server-name', type=str, default='0.0.0.0', help='Demo server name.')
args = parser.parse_args()
return args
if __name__ == "__main__":
    # Entry point: parse CLI flags, then build and serve the Gradio demo.
    _launch_demo(_get_args())