import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import spaces
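# `spaces` provides the @spaces.GPU decorator for ZeroGPU Spaces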

# Initialize the model and tokenizer
print("Loading VibeThinker model...")
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    trust_remote_code=True
)
print("Model loaded successfully!")
@spaces.GPU
def respond(message, history):
    """
    Generate a streaming response for the chatbot.

    Args:
        message: The user's current message.
        history: Previous conversation turns as a list of
            {"role": ..., "content": ...} dicts.
    """
    # Copy the history so Gradio's state isn't mutated in place
    messages = list(history) if history else []
    # Add the current user message
    messages.append({"role": "user", "content": message})
    # Apply the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Tokenize and move inputs to the model's device
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Generation settings from the official usage example
    generation_config = dict(
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=None
    )
    # Generate, wrapping the parameters in a GenerationConfig
    generated_ids = model.generate(
        **model_inputs,
        generation_config=GenerationConfig(**generation_config)
    )
    # Trim the prompt tokens from the output, as in the official usage example
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # Decode the completion, skipping special tokens
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # For a streaming effect, yield the response character by character
    partial_response = ""
    for char in response:
        partial_response += char
        yield partial_response
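    # Note: generation completes before any text is yielded, so this only
    # simulates streaming. True token-level streaming would need something
    # like transformers.TextIteratorStreamer driving generate() from a
    # background thread.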

# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header-link { text-decoration: none; color: inherit; }
    .header-link:hover { text-decoration: underline; }
    """
) as demo:
    gr.Markdown(
        """
        # 💭 VibeThinker Chatbot
        Chat with [WeiboAI/VibeThinker-1.5B](https://huggingface.co/WeiboAI/VibeThinker-1.5B) - a powerful conversational AI model.
        <a href="https://huggingface.co/spaces/akhaliq/anycoder" class="header-link">Built with anycoder</a>
        """
    )
    gr.ChatInterface(
        fn=respond,
        type="messages",
        title="",
        description="Ask me anything! I'm powered by VibeThinker with ZeroGPU acceleration.",
        examples=[
            "What is 2 + 2?",
            "Tell me a short joke",
            "What is the capital of France?",
            "Explain AI in one sentence",
        ],
        cache_examples=False,
        # Let the model's <think>...</think> tags render instead of being stripped
        chatbot=gr.Chatbot(allow_tags=["think"]),
    )
    gr.Markdown(
        """
        ### About VibeThinker
        VibeThinker is a 1.5B-parameter conversational AI model designed for engaging and thoughtful conversations.
        The model uses temperature sampling (0.6) for balanced creativity and coherence.
        **Powered by ZeroGPU** for efficient GPU resource allocation.
        """
    )

if __name__ == "__main__":
    demo.launch()