"""Gradio streaming chat demo for the MiniCrit-1.5B causal LM."""

import threading

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

MODEL = "wmaousley/MiniCrit-1.5B"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# NOTE(review): float16 with device_map="cpu" is unusual — many CPU kernels
# lack fp16 support; if generation errors out, switch to torch.float32 or
# bfloat16. Kept as-is to preserve the original configuration.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="cpu",
)


def generate_stream(prompt):
    """Yield decoded text chunks for *prompt* as the model generates them.

    Runs ``model.generate`` in a worker thread so this generator can yield
    each chunk as soon as the streamer produces it.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    # skip_prompt=True: without it the streamer first re-emits the entire
    # prompt (the whole chat history) before the newly generated reply.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        do_sample=True,
        streamer=streamer,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    try:
        for new_text in streamer:
            yield new_text
    finally:
        # Don't leak the worker thread, even if the consumer stops early.
        thread.join()


def chat_fn(message, history):
    """Stream a reply, yielding the full updated chat history.

    Args:
        message: The new user message.
        history: List of (user, bot) pairs shown in the Chatbot widget.

    Yields:
        The full history including the partially generated reply — the
        format ``gr.Chatbot`` expects (the original yielded a bare string,
        which breaks the widget).
    """
    # Flatten prior turns into the plain-text prompt format the model expects.
    conversation = ""
    for user, bot in history:
        conversation += f"User: {user}\nMiniCrit: {bot}\n"
    conversation += f"User: {message}\nMiniCrit:"

    reply = ""
    for token in generate_stream(conversation):
        reply += token
        yield history + [(message, reply)]


# -------- UI --------
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(
        """
Enhanced Streaming Interface
"""
    )
    chatbox = gr.Chatbot(
        label="MiniCrit-1.5B",
        height=500,
    )
    with gr.Row():
        msg = gr.Textbox(
            placeholder="Ask something...",
            label="Message",
            scale=10,
        )
        send = gr.Button("Send", variant="primary")
        clear = gr.Button("Clear")

    # Chain the textbox clear AFTER the chat handler via .then() — the
    # original registered two independent click listeners, racing the clear
    # against the handler that reads the textbox.
    send.click(chat_fn, [msg, chatbox], chatbox).then(lambda: "", None, msg)
    # Also allow Enter-to-send from the textbox.
    msg.submit(chat_fn, [msg, chatbox], chatbox).then(lambda: "", None, msg)
    clear.click(lambda: [], None, chatbox)

demo.launch(debug=True)