import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces

# Model configuration
MODEL_PATH = "ibm-granite/granite-4.0-h-small"

# Load tokenizer (doesn't need GPU)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load model and move to GPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
model.to('cuda')
model.eval()


@spaces.GPU(duration=60)
def generate_response(message, history):
    """Generate a response using the IBM Granite model with ZeroGPU."""
    # Rebuild the conversation in the chat-template format
    chat = []
    for user_msg, assistant_msg in history:
        chat.append({"role": "user", "content": user_msg})
        if assistant_msg:
            chat.append({"role": "assistant", "content": assistant_msg})

    # Add the current message
    chat.append({"role": "user", "content": message})

    # Apply the model's chat template
    formatted_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize the prompt
    input_tokens = tokenizer(
        formatted_chat,
        return_tensors="pt",
        truncation=True,
        max_length=2048
    ).to('cuda')

    # Generate output tokens
    with torch.no_grad():
        output = model.generate(
            **input_tokens,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens so the prompt is not echoed back
    prompt_length = input_tokens["input_ids"].shape[1]
    response = tokenizer.decode(
        output[0][prompt_length:],
        skip_special_tokens=True
    ).strip()

    return response

# Create the Gradio interface
with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px;">
            <h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🪨 IBM Granite 4.0 Chat</h1>
            <p style="font-size: 1.1em; color: #666; margin-bottom: 1em;">
                Chat with the IBM Granite 4.0-h Small model, powered by ZeroGPU
            </p>
            <p style="font-size: 0.9em; color: #888;">
                <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">
                    Built with anycoder
                </a>
            </p>
        </div>
        """
    )

    chatbot = gr.Chatbot(
        height=500,
        bubble_full_width=False,
        show_copy_button=True,
        layout="panel"
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            lines=2,
            scale=9,
            autofocus=True
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Row():
        clear_btn = gr.ClearButton([msg, chatbot], value="🗑️ Clear Chat")

    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown("""
        ### Model Information
        - **Model**: IBM Granite 4.0-h Small
        - **Parameters**: Optimized for efficient inference
        - **Powered by**: Hugging Face ZeroGPU

        ### Tips for Better Responses
        - Be specific and clear in your questions
        - Provide context when needed
        - The model handles a range of tasks, including coding, analysis, and general conversation
        """)

    # Example prompts
    gr.Examples(
        examples=[
            "Explain quantum computing in simple terms",
            "Write a Python function to calculate factorial",
            "What are the main differences between machine learning and deep learning?",
            "Help me debug this code: def add(a, b) return a + b",
            "Create a healthy meal plan for a week",
            "Explain the concept of blockchain technology",
        ],
        inputs=msg,
        label="Example Prompts"
    )

    # Event handlers
    def user_submit(message, history):
        """Append the user's message to the history and clear the textbox."""
        if not message.strip():
            return "", history
        return "", history + [[message, None]]

    def bot_response(history):
        """Fill in the assistant's reply for the latest user message."""
        if not history or history[-1][1] is not None:
            return history
        user_message = history[-1][0]
        bot_message = generate_response(user_message, history[:-1])
        history[-1][1] = bot_message
        return history

    # Connect events: submitting the textbox or clicking Send queues a response
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )

    # Footer
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 30px; padding: 20px; border-top: 1px solid #e0e0e0;">
            <p style="color: #666; font-size: 0.9em;">
                This application uses the IBM Granite 4.0-h Small model to generate responses.
                <br>Responses are generated by AI and should be verified for accuracy.
            </p>
        </div>
        """
    )

# Launch the application
if __name__ == "__main__":
demo.queue()
demo.launch(
show_api=False,
share=False
) |