"""Minimal Gradio chat demo around a Hugging Face text-generation pipeline.

Streaming is simulated: the model is re-run with a growing
``max_new_tokens`` budget and each partial completion is yielded.
"""
import time

import gradio as gr
from transformers import pipeline

# Load the model once at import time (slow; downloads weights on first use).
pipe = pipeline("text-generation", model="prithivMLmods/rStar-Coder-Qwen3-0.6B")

# NOTE(review): this module-level history is shared across ALL browser
# sessions; per-user conversation state should live in gr.State instead.
history = []


def chat_fn_stream(user_input):
    """Yield progressively longer bot replies for ``user_input``.

    Appends the user turn to the global ``history``, then repeatedly
    re-generates with a growing token budget to fake token-by-token
    streaming. The final reply is appended to ``history`` when the
    generator is exhausted.

    Yields:
        str: the partial bot reply so far (each yield supersedes the last).
    """
    global history
    history.append(f"User: {user_input}")
    context = "\n".join(history) + "\nBot:"

    bot_reply = ""
    # Fake streaming: grow the generation budget in 20-token chunks.
    # (Same budgets as the original 0..8191 step-20 loop with i+20.)
    for budget in range(20, 8212, 20):
        output = pipe(
            context,
            max_new_tokens=budget,
            do_sample=True,
            top_p=0.9,
            return_full_text=False,
        )[0]["generated_text"]
        new_reply = output.split("Bot:")[-1].strip()
        yield new_reply  # stream partial reply
        # Early stop once the model has clearly finished: the reply no
        # longer changes between budget increases. With do_sample=True each
        # call re-samples, so this is best-effort — it only avoids burning
        # through the full 8k budget after generation has converged.
        if new_reply == bot_reply and new_reply:
            break
        bot_reply = new_reply
        time.sleep(0.1)  # small delay to simulate streaming

    history.append(f"Bot: {bot_reply}")


# Gradio interface
with gr.Blocks() as demo:
    chatbot_ui = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type a message...")

    def respond(user_input, chat_history):
        """Stream the bot's reply into the chat window.

        Yields ``(chatbot_value, state_value)`` pairs so Gradio repaints
        the chat after every partial reply.
        """
        # BUG FIX: the original assigned chat_history[-1] without first
        # appending the new turn, raising IndexError on the first message
        # (and clobbering the previous turn otherwise). Append a
        # placeholder turn, then update it in place while streaming.
        chat_history = chat_history + [(user_input, "")]
        for partial in chat_fn_stream(user_input):
            chat_history[-1] = (user_input, partial)
            yield chat_history, chat_history

    state = gr.State([])
    msg.submit(respond, [msg, state], [chatbot_ui, state])

demo.launch()