import os
from argparse import ArgumentParser

import dashscope
import gradio as gr

# These env vars only affect vLLM-based serving; they are set before any
# worker could spawn. NOTE(review): this script calls the DashScope API and
# never starts vLLM locally — presumably leftovers from a sibling demo.
os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

# NOTE(review): if DASHSCOPE_API_KEY is unset this silently becomes None and
# every API call fails at request time; the gr.Error below surfaces that.
dashscope.api_key = os.environ.get("DASHSCOPE_API_KEY")

DEFAULT_CKPT_PATH = "Qwen/Qwen3-Omni-30B-A3B-Captioner"


def _launch_demo(args):
    """Build and launch the Gradio audio-captioning demo.

    Args:
        args: Parsed CLI namespace supplying the launch options
            ``share``, ``inbrowser``, ``server_port`` and ``server_name``.
    """

    def generate_caption_from_audio(audio_path, temperature, top_p, top_k):
        """Caption a local audio file via the DashScope captioner model.

        Args:
            audio_path: Filesystem path to the uploaded/recorded audio.
            temperature, top_p, top_k: Sampling parameters forwarded to the
                model.

        Returns:
            The generated caption text.

        Raises:
            gr.Error: If the API call does not succeed, so the UI shows a
                readable message instead of an opaque ``KeyError`` traceback.
        """
        messages = [
            {
                "role": "user",
                # DashScope expects a file:// URI for local audio input.
                "content": [{"audio": "file://" + audio_path}],
            }
        ]
        response = dashscope.MultiModalConversation.call(
            model="qwen3-omni-30b-a3b-captioner",
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            messages=messages)
        # Fix: the original indexed straight into the response; a failed call
        # (bad key, rejected audio, quota) carries no "output" key and crashed
        # with a KeyError. Surface the API's own error message instead.
        if response.status_code != 200:
            raise gr.Error(
                f"DashScope API error {response.code}: {response.message}")
        return response["output"]["choices"][0]["message"].content[0]["text"]

    def on_submit(audio_path, temperature, top_p, top_k):
        """Click handler: yields (caption, submit-button update)."""
        # Guard clause: nothing uploaded/recorded — clear output, re-enable.
        if not audio_path:
            yield None, gr.update(interactive=True)
            return
        caption = generate_caption_from_audio(
            audio_path, temperature, top_p, top_k)
        yield caption, gr.update(interactive=True)

    theme = gr.themes.Soft(
        font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"])
    with gr.Blocks(theme=theme) as demo:
        gr.Markdown("# Qwen3-Omni-30B-A3B-Captioner Demo")
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    sources=['upload', 'microphone'],
                    type="filepath",
                    label="Upload or record an audio")
                with gr.Accordion("Generation Parameters", open=True):
                    temperature = gr.Slider(
                        label="Temperature",
                        minimum=0.1, maximum=2.0, value=0.6, step=0.1)
                    top_p = gr.Slider(
                        label="Top P",
                        minimum=0.05, maximum=1.0, value=0.95, step=0.05)
                    top_k = gr.Slider(
                        label="Top K",
                        minimum=1, maximum=100, value=20, step=1)
                with gr.Row():
                    submit_btn = gr.Button("Submit", variant="primary")
                    clear_btn = gr.Button("Clear")
            with gr.Column(scale=2):
                output_caption = gr.Textbox(
                    label="Caption Result",
                    lines=15,
                    interactive=False
                )

        def clear_fields():
            """Reset both the audio input and the caption textbox."""
            return None, ""

        submit_btn.click(
            fn=on_submit,
            inputs=[audio_input, temperature, top_p, top_k],
            outputs=[output_caption, submit_btn]
        )
        clear_btn.click(
            fn=clear_fields,
            inputs=None,
            outputs=[audio_input, output_caption])

    demo.queue(100, max_size=100).launch(
        max_threads=100,
        ssr_mode=False,
        share=args.share,
        inbrowser=args.inbrowser,
        server_port=args.server_port,
        server_name=args.server_name,
    )


def _get_args():
    """Parse command-line options for the demo server.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = ArgumentParser()
    # NOTE(review): --checkpoint-path, --flash-attn2 and --use-transformers
    # are accepted but unused here (inference goes through the DashScope
    # API); kept for CLI compatibility with the sibling local-model demos.
    parser.add_argument('-c', '--checkpoint-path', type=str,
                        default=DEFAULT_CKPT_PATH,
                        help='Checkpoint name or path, default to %(default)r')
    parser.add_argument('--flash-attn2', action='store_true', default=False,
                        help='Enable flash_attention_2 when loading the model.')
    parser.add_argument('--use-transformers', action='store_true',
                        default=False,
                        help='Use transformers for inference instead of vLLM.')
    parser.add_argument('--share', action='store_true', default=False,
                        help='Create a publicly shareable link for the interface.')
    parser.add_argument('--inbrowser', action='store_true', default=False,
                        help='Automatically launch the interface in a new tab on the default browser.')
    parser.add_argument('--server-port', type=int, help='Demo server port.')
    parser.add_argument('--server-name', type=str, default='0.0.0.0',
                        help='Demo server name.')
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = _get_args()
    _launch_demo(args)