import spaces
import gradio as gr
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
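# NOTE: on ZeroGPU Spaces, `spaces` is imported before anything that could
# initialize CUDA so the runtime can patch torch; actual GPU time is only
# granted inside functions decorated with @spaces.GPU (see generate_response).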
# Model configuration
MODEL_PATH = "ibm-granite/granite-4.0-h-small"

# Load tokenizer (doesn't need GPU)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load model and move to GPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
model.to('cuda')
model.eval()
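# float16 halves the memory footprint relative to float32, and eval() disables
# dropout, so output varies only through the sampling settings used below.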
@spaces.GPU  # required on ZeroGPU: a GPU is attached only for the duration of this call
def generate_response(message, history):
    """Generate a streaming response from the IBM Granite model on ZeroGPU."""
    # Format the conversation history
    chat = []

    # Add conversation history
    for user_msg, assistant_msg in history:
        chat.append({"role": "user", "content": user_msg})
        if assistant_msg:
            chat.append({"role": "assistant", "content": assistant_msg})

    # Add current message
    chat.append({"role": "user", "content": message})

    # Apply chat template
    formatted_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize the text
    input_tokens = tokenizer(
        formatted_chat,
        return_tensors="pt",
        truncation=True,
        max_length=2048
    ).to('cuda')
    # Setup for streaming generation
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )
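    # The streamer yields decoded text chunks as generate() produces tokens;
    # skip_prompt=True drops the echoed input so only new text is streamed.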
    # Generation kwargs
    generation_kwargs = dict(
        **input_tokens,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )
    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()
# Create the Gradio interface
with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px;">
            <h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🪨 IBM Granite 4.0 Chat</h1>
            <p style="font-size: 1.1em; color: #666; margin-bottom: 1em;">
                Chat with the IBM Granite 4.0-h Small model, powered by ZeroGPU
            </p>
            <p style="font-size: 0.9em; color: #888;">
                <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">
                    Built with anycoder
                </a>
            </p>
        </div>
        """
    )

    # bubble_full_width is deprecated in recent Gradio releases and has no
    # effect, so it is omitted here.
    chatbot = gr.Chatbot(
        height=500,
        show_copy_button=True,
        layout="panel"
    )
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            lines=2,
            scale=9,
            autofocus=True
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Row():
        clear_btn = gr.ClearButton([msg, chatbot], value="🗑️ Clear Chat")
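    # gr.ClearButton resets the listed components directly in the browser,
    # so no Python callback is needed for clearing.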
    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown("""
        ### Model Information
        - **Model**: IBM Granite 4.0-h Small
        - **Parameters**: Optimized for efficient inference
        - **Powered by**: Hugging Face ZeroGPU

        ### Tips for Better Responses
        - Be specific and clear in your questions
        - Provide context when needed
        - The model handles a wide range of tasks, including coding, analysis, and general conversation
        """)
    # Example prompts
    gr.Examples(
        examples=[
            "Explain quantum computing in simple terms",
            "Write a Python function to calculate factorial",
            "What are the main differences between machine learning and deep learning?",
            "Help me debug this code: def add(a, b) return a + b",
            "Create a healthy meal plan for a week",
            "Explain the concept of blockchain technology",
        ],
        inputs=msg,
        label="Example Prompts"
    )
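    # The handlers below use Gradio's "tuples" chat history format: a list of
    # [user_message, assistant_message] pairs. Newer Gradio versions prefer
    # type="messages" on gr.Chatbot, which would require reshaping these
    # handlers into role/content dicts.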
    # Event handlers
    def user_submit(message, history):
        if not message.strip():
            return "", history
        return "", history + [[message, None]]

    def bot_response(history):
        if not history or history[-1][1] is not None:
            yield history
            return
        user_message = history[-1][0]
        history[-1][1] = ""
        for partial_response in generate_response(user_message, history[:-1]):
            history[-1][1] = partial_response
            yield history
    # Connect events
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
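    # queue=False lets the lightweight user_submit step run immediately; the
    # chained .then() call runs bot_response through the default queue, which
    # is what lets the streamed partial updates render in the chatbot.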
    # Add footer
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 30px; padding: 20px; border-top: 1px solid #e0e0e0;">
            <p style="color: #666; font-size: 0.9em;">
                This application uses the IBM Granite 4.0-h Small model to generate responses.
                <br>Responses are AI-generated and should be verified for accuracy.
            </p>
        </div>
        """
    )
# Launch the application
if __name__ == "__main__":
    demo.queue()
    demo.launch(
        show_api=False,
        share=False
    )
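# Assumed Space dependencies (a hypothetical requirements.txt, not part of the
# original file): gradio, torch, transformers, and accelerate (needed for
# low_cpu_mem_usage=True); the `spaces` package is provided on ZeroGPU
# hardware, so list it only if your environment lacks it.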