Spaces: Running on Zero
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces
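# `spaces` provides the @spaces.GPU decorator used by ZeroGPU Spaces to
# request a GPU on demand; it comes preinstalled on ZeroGPU hardware.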
# Model config
MODEL_ID = "WeiboAI/VibeThinker-1.5B"
SYSTEM_PROMPT = "You are a concise solver. Respond briefly."
# Load model
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
print("Model loaded!")
@spaces.GPU
def chat_with_stream(message, history, progress=gr.Progress()):
    """Chat handler (returns the full response at once, no token streaming)"""
    # Handle inputs safely
    if message is None:
        message = "Hello"
    if history is None:
        history = []
    # Convert to string
    message = str(message)
    progress(0.1, desc="Building conversation...")
    # Build messages
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    # Add history (assumed to be [user, assistant] pairs)
    for user_msg, assistant_msg in history:
        if user_msg is not None:
            messages.append({"role": "user", "content": str(user_msg)})
        if assistant_msg is not None:
            messages.append({"role": "assistant", "content": str(assistant_msg)})
    progress(0.3, desc="Adding your message...")
    messages.append({"role": "user", "content": message})
    progress(0.5, desc="Formatting input...")
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    progress(0.6, desc="Tokenizing...")
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
| progress(0.7, desc="Starting generation...") | |
| # Generate with streaming | |
| with torch.no_grad(): | |
| outputs = model.generate( | |
| **inputs, | |
| max_new_tokens=1000, | |
| do_sample=True, | |
| temperature=0.7, | |
| top_p=0.9, | |
| pad_token_id=tokenizer.eos_token_id, | |
| return_dict_in_generate=True, | |
| output_scores=False, | |
| ) | |
| progress(0.9, desc="Decoding response...") | |
| # Decode | |
| full_text = tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| # Extract assistant response | |
| if "assistant" in full_text: | |
| response = full_text.split("assistant")[-1].strip() | |
| else: | |
| response = full_text | |
| progress(1.0, desc="Complete!") | |
| return response | |
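# gr.ChatInterface builds the chat UI and calls the function above with the
# latest user message plus the running conversation history.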
def create_demo():
    """Create simple demo"""
    demo = gr.ChatInterface(
        fn=chat_with_stream,
        title="VibeThinker Chat",
        description="Simple chat with VibeThinker-1.5B",
        examples=["2+2", "What is AI?", "Write a poem"]
    )
    return demo
| if __name__ == "__main__": | |
| print("Starting...") | |
| demo = create_demo() | |
| demo.launch(share=False) |
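Deployment note: this script assumes ZeroGPU is selected as the Space hardware and that requirements.txt provides torch, transformers, and accelerate (accelerate backs device_map="auto"); gradio and the spaces package are typically available by default on Gradio ZeroGPU Spaces.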