# 🪨 IBM Granite 4.0 Chat

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces

# Hub identifier of the checkpoint this app serves
MODEL_PATH = "ibm-granite/granite-4.0-h-small"

# The tokenizer runs on CPU, so it is created first and needs no device move
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the weights in float16, place them on the GPU and switch the module
# to evaluation mode in a single expression
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to("cuda").eval()

@spaces.GPU(duration=60)
def generate_response(message, history):
    """Generate the assistant's reply to ``message`` with the Granite model.

    The function is wrapped in ``spaces.GPU(duration=60)`` so it runs with a
    ZeroGPU allocation.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_msg, assistant_msg)`` pairs. A pair
            whose assistant part is falsy contributes only its user turn.

    Returns:
        The newly generated text with special tokens removed and surrounding
        whitespace stripped.
    """
    # Rebuild the conversation in the role/content format used by the chat template
    chat = []
    for user_msg, assistant_msg in history:
        chat.append({"role": "user", "content": user_msg})
        if assistant_msg:
            chat.append({"role": "assistant", "content": assistant_msg})
    chat.append({"role": "user", "content": message})

    # Render the conversation as text, ending with the assistant generation prompt
    formatted_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    # NOTE(review): with truncation enabled, a conversation longer than 2048
    # tokens is cut at whichever end the tokenizer's truncation side selects;
    # if that is the right side, the newest turn is what gets dropped — confirm.
    input_tokens = tokenizer(
        formatted_chat,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to('cuda')
    prompt_length = input_tokens["input_ids"].shape[-1]

    # Sample a continuation; no gradients are needed for inference
    with torch.no_grad():
        output = model.generate(
            **input_tokens,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt. Slicing the decoded
    # string by len(formatted_chat) is wrong because skip_special_tokens=True
    # removes the template's special tokens, so the decoded prompt is shorter
    # than formatted_chat and the beginning of the reply would be cut off.
    # Token-level slicing also makes any "assistant" prefix clean-up
    # unnecessary (and avoids mangling replies that start with that word).
    new_tokens = output[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Create the Gradio interface
# Build the page layout; components render in the order they are created here.
with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
    # Page header: centered title, subtitle and a "Built with anycoder" link
    gr.HTML(
        """
        <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px;">
            <h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🪨 IBM Granite 4.0 Chat</h1>
            <p style="font-size: 1.1em; color: #666; margin-bottom: 1em;">
                Chat with IBM Granite 4.0-h Small model powered by ZeroGPU
            </p>
            <p style="font-size: 0.9em; color: #888;">
                <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">
                    Built with anycoder
                </a>
            </p>
        </div>
        """
    )
    
    # Conversation display; its value is the history passed to the event handlers
    chatbot = gr.Chatbot(
        height=500,
        bubble_full_width=False,
        show_copy_button=True,
        layout="panel"
    )
    
    # Input row: message box (scale 9) next to the Send button (scale 1)
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            lines=2,
            scale=9,
            autofocus=True
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)
    
    # Clear button is bound to both the textbox and the chat display
    with gr.Row():
        clear_btn = gr.ClearButton([msg, chatbot], value="🗑️ Clear Chat")
        
    # Collapsed-by-default panel; it only shows static model info and usage tips
    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown("""
        ### Model Information
        - **Model**: IBM Granite 4.0-h Small
        - **Parameters**: Optimized for efficient inference
        - **Powered by**: Hugging Face ZeroGPU
        
        ### Tips for Better Responses:
        - Be specific and clear in your questions
        - Provide context when needed
        - The model excels at various tasks including coding, analysis, and general conversation
        """)
    
    # Example prompts; each example targets the message textbox
    gr.Examples(
        examples=[
            "Explain quantum computing in simple terms",
            "Write a Python function to calculate factorial",
            "What are the main differences between machine learning and deep learning?",
            "Help me debug this code: def add(a, b) return a + b",
            "Create a healthy meal plan for a week",
            "Explain the concept of blockchain technology",
        ],
        inputs=msg,
        label="Example Prompts"
    )
    
    # Event handlers
    def user_submit(message, history):
        """Queue the user's message as a new, not-yet-answered chat turn.

        Args:
            message: Text from the input box; ``None`` or whitespace-only
                input is ignored.
            history: Current chat history as ``[user, assistant]`` pairs, or
                ``None`` when the chat is empty.

        Returns:
            A ``(textbox_value, history)`` tuple. The textbox value is always
            ``""`` so the input is cleared. For an ignored message the history
            is returned unchanged; otherwise a new list is returned with
            ``[message, None]`` appended (the input list is not mutated).
        """
        # Guard against a missing message as well as a blank one
        if not message or not message.strip():
            return "", history
        # Treat a missing history as an empty conversation instead of crashing
        return "", (history or []) + [[message, None]]
    
    def bot_response(history):
        """Fill in the assistant reply for the last, still unanswered turn.

        The history is returned untouched when it is empty or when its last
        turn already has a reply. Otherwise the reply is generated from the
        last user message plus all earlier turns, written into the last turn
        in place, and the same history object is returned.
        """
        awaiting_reply = bool(history) and history[-1][1] is None
        if not awaiting_reply:
            return history

        pending_turn = history[-1]
        pending_turn[1] = generate_response(pending_turn[0], history[:-1])
        return history
    
    # Connect events: pressing Enter in the textbox and clicking Send run the
    # same two-step chain (record the user turn, then generate the reply)
    for trigger in (msg.submit, submit_btn.click):
        trigger(
            user_submit, [msg, chatbot], [msg, chatbot], queue=False
        ).then(bot_response, chatbot, chatbot)
    
    # Footer with the AI-generated-content disclaimer
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 30px; padding: 20px; border-top: 1px solid #e0e0e0;">
            <p style="color: #666; font-size: 0.9em;">
                This application uses the IBM Granite 4.0-h Small model for generating responses.
                <br>Responses are generated using AI and should be verified for accuracy.
            </p>
        </div>
        """
    )

# Launch the application
if __name__ == "__main__":
    # Enable the request queue, then start the server without the API page
    # and without a public share link
    demo.queue().launch(show_api=False, share=False)