import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model
model_name = "prithivMLmods/rStar-Coder-Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
model = torch.compile(model)
if torch.cuda.is_available():
    model = model.to("cuda")

history = []

def stream_chat(user_input):
    global history
    history.append(f"User: {user_input}")
    context = "\n".join(history) + "\nBot:"

    # Tokenize input
    input_ids = tokenizer(context, return_tensors="pt").input_ids
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")

    # Generate token by token
    output_ids = input_ids.clone()
    bot_reply = ""
    max_new_tokens = 200  # adjust as needed

    for _ in range(max_new_tokens):
        with torch.no_grad():
            outputs = model(output_ids)
        next_token_logits = outputs.logits[0, -1, :]
        next_token = torch.argmax(next_token_logits).unsqueeze(0)
        output_ids = torch.cat([output_ids, next_token.unsqueeze(0)], dim=1)

        token_str = tokenizer.decode(next_token)
        bot_reply += token_str

        # Yield streaming output
        yield bot_reply

        # Stop if EOS token
        if next_token.item() == tokenizer.eos_token_id:
            break

    history.append(f"Bot: {bot_reply}")

# Gradio interface
with gr.Blocks() as demo:
    chatbot_ui = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type a message...")

    def respond(user_input, chat_history):
        chat_history.append((user_input, ""))
        for partial in stream_chat(user_input):
            chat_history[-1] = (user_input, partial)
            yield chat_history, chat_history

    state = gr.State([])
    msg.submit(respond, [msg, state], [chatbot_ui, state])

demo.launch()
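The app above streams by running a full forward pass per token and taking the argmax, which keeps the loop explicit but recomputes the prefix on every step. For comparison, here is a minimal sketch of the same streaming idea built on Transformers' TextIteratorStreamer, which runs generate() in a background thread and yields decoded text chunks as they arrive; the model name is reused from above, and the exact streamer behavior may vary with the installed transformers version.

    # Minimal streaming sketch using TextIteratorStreamer (assumes a recent transformers release)
    from threading import Thread

    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

    model_name = "prithivMLmods/rStar-Coder-Qwen3-0.6B"  # same model as the app above
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()
    if torch.cuda.is_available():
        model = model.to("cuda")

    def stream_reply(prompt, max_new_tokens=200):
        # Tokenize and move tensors to the model's device
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        # skip_prompt drops the echoed input; skip_special_tokens drops EOS etc.
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)
        # generate() blocks, so run it in a worker thread and read from the streamer here
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        partial = ""
        for chunk in streamer:
            partial += chunk
            yield partial  # same "growing reply" contract as stream_chat above
        thread.join()

A generator like stream_reply could be dropped into the Gradio respond callback in place of stream_chat, since both yield the progressively longer bot reply.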