"""Minimal Gradio chat demo around a Hugging Face text-generation pipeline.

Streaming is simulated: the model is re-run with a growing
``max_new_tokens`` budget and each partial completion is yielded.
"""
import time

import gradio as gr
from transformers import pipeline

# Load the model once at import time (slow; downloads weights on first use).
pipe = pipeline("text-generation", model="prithivMLmods/rStar-Coder-Qwen3-0.6B")

# NOTE(review): this module-level history is shared across ALL browser
# sessions; per-user conversation state should live in gr.State instead.
history = []


def chat_fn_stream(user_input):
    """Yield progressively longer bot replies for ``user_input``.

    Appends the user turn to the global ``history``, then repeatedly
    re-generates with a growing token budget to fake token-by-token
    streaming. The final reply is appended to ``history`` when the
    generator is exhausted.

    Yields:
        str: the partial bot reply so far (each yield supersedes the last).
    """
    global history
    history.append(f"User: {user_input}")
    context = "\n".join(history) + "\nBot:"

    bot_reply = ""
    # Fake streaming: grow the generation budget in 20-token chunks.
    # (Same budgets as the original 0..8191 step-20 loop with i+20.)
    for budget in range(20, 8212, 20):
        output = pipe(
            context,
            max_new_tokens=budget,
            do_sample=True,
            top_p=0.9,
            return_full_text=False,
        )[0]["generated_text"]
        new_reply = output.split("Bot:")[-1].strip()
        yield new_reply  # stream partial reply
        # Early stop once the model has clearly finished: the reply no
        # longer changes between budget increases. With do_sample=True each
        # call re-samples, so this is best-effort — it only avoids burning
        # through the full 8k budget after generation has converged.
        if new_reply == bot_reply and new_reply:
            break
        bot_reply = new_reply
        time.sleep(0.1)  # small delay to simulate streaming

    history.append(f"Bot: {bot_reply}")


# Gradio interface
with gr.Blocks() as demo:
    chatbot_ui = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type a message...")

    def respond(user_input, chat_history):
        """Stream the bot's reply into the chat window.

        Yields ``(chatbot_value, state_value)`` pairs so Gradio repaints
        the chat after every partial reply.
        """
        # BUG FIX: the original assigned chat_history[-1] without first
        # appending the new turn, raising IndexError on the first message
        # (and clobbering the previous turn otherwise). Append a
        # placeholder turn, then update it in place while streaming.
        chat_history = chat_history + [(user_input, "")]
        for partial in chat_fn_stream(user_input):
            chat_history[-1] = (user_input, partial)
            yield chat_history, chat_history

    state = gr.State([])
    msg.submit(respond, [msg, state], [chatbot_ui, state])

demo.launch()