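"""Gradio chat demo for Qwen/Qwen3-VL-2B-Instruct.

Supports text chat with an optional image input. Written for Hugging Face
Spaces; the `spaces` import provides the ZeroGPU decorator used below.
"""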
import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import io
import base64
import spaces
# Load the model in bfloat16; device_map="auto" lets accelerate place the
# weights on the available GPU (falling back to CPU if none is present)
model = Qwen3VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-2B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
def process_image(image):
    """Convert a PIL image to a base64 data-URI string.

    Note: currently unused, since the processor's chat template accepts PIL
    images directly; kept as a helper for URL/base64-based inputs.
    """
if isinstance(image, str):
return image
if isinstance(image, Image.Image):
buffered = io.BytesIO()
image.save(buffered, format="PNG")
img_str = base64.b64encode(buffered.getvalue()).decode()
return f"data:image/png;base64,{img_str}"
return image
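
# On Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration
# of each call (here capped at 120 seconds); outside of Spaces the decorator
# is expected to be a no-op, depending on the installed `spaces` package.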
@spaces.GPU(duration=120)
def qwen_chat(message, image, chat_history):
"""
Process chat message with optional image input
Args:
message (str): User's text message
image: Optional image input
        chat_history (list): Previous conversation in the gr.Chatbot
            "messages" format, i.e. a list of {"role": ..., "content": ...} dicts
    Returns:
        tuple: (updated chat history, empty string to clear the message box)
"""
if not message and image is None:
return chat_history, ""
# Build messages list
messages = []
    # Add previous turns; chat_history arrives in the gr.Chatbot "messages"
    # format, a list of {"role": ..., "content": ...} dicts
    for prev in chat_history:
        messages.append({"role": prev["role"], "content": [{"type": "text", "text": prev["content"]}]})
# Add current message with optional image
current_content = []
if image is not None:
current_content.append({
"type": "image",
"image": image
})
if message:
current_content.append({
"type": "text",
"text": message
})
messages.append({
"role": "user",
"content": current_content
})
    # apply_chat_template renders the conversation into the model's chat format
    # and, with tokenize=True, runs the full processor (including image
    # preprocessing), returning ready-to-use tensors
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
inputs = inputs.to(model.device)
# Generate response
with torch.no_grad():
generated_ids = model.generate(**inputs, max_new_tokens=256)
    # Trim the prompt tokens from each sequence so only the newly
    # generated reply is decoded
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
    # Append the new turn in the "messages" format expected by
    # gr.Chatbot(type="messages")
    chat_history.append({"role": "user", "content": message if message else "[Image provided]"})
    chat_history.append({"role": "assistant", "content": output_text})
return chat_history, ""
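
# Note: for token-by-token streaming one could run model.generate in a thread
# with transformers.TextIteratorStreamer and yield partial histories; this demo
# keeps things simple and returns the full reply at once.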
# Create Gradio interface
with gr.Blocks(title="Qwen3-VL Chat") as demo:
gr.Markdown(
"""
# 🎨 Qwen3-VL Chat
        Chat with Qwen3-VL-2B-Instruct, a multimodal model that understands both text and images.
[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""
)
with gr.Row():
with gr.Column(scale=3):
chatbot = gr.Chatbot(
label="Chat History",
type="messages",
height=600,
show_copy_button=True
)
with gr.Column(scale=1):
image_input = gr.Image(
label="Upload Image (Optional)",
type="pil",
sources=["upload", "clipboard"],
interactive=True
)
with gr.Row():
message_input = gr.Textbox(
label="Message",
placeholder="Type your message here...",
lines=2,
scale=4
)
send_btn = gr.Button("Send", scale=1, variant="primary")
with gr.Row():
clear_btn = gr.Button("Clear Chat", variant="secondary")
gr.Markdown(
"""
### Tips:
- Upload an image to ask questions about it
- Describe what you see or ask for analysis
- The model can answer questions about images and text
"""
)
    # Event handlers: qwen_chat can be wired to the click/submit events
    # directly, no wrapper needed
send_btn.click(
        qwen_chat,
inputs=[message_input, image_input, chatbot],
outputs=[chatbot, message_input]
)
message_input.submit(
        qwen_chat,
inputs=[message_input, image_input, chatbot],
outputs=[chatbot, message_input]
)
clear_btn.click(
lambda: ([], None, ""),
outputs=[chatbot, image_input, message_input]
)
if __name__ == "__main__":
demo.launch(share=False)
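
# To get a temporary public gradio.live link when running locally, launch with
# share=True instead; on Spaces the app is already publicly served.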