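"""Gradio chat demo for the IBM Granite 4.0-h Small model, served on Hugging Face ZeroGPU."""
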
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces

# Model configuration
MODEL_PATH = "ibm-granite/granite-4.0-h-small"

# Load tokenizer (doesn't need GPU)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load model and move to GPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
model.to('cuda')
model.eval()
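
# ZeroGPU attaches a GPU only while a @spaces.GPU-decorated function is running;
# duration=60 declares the expected maximum runtime per call, in seconds.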
@spaces.GPU(duration=60)
def generate_response(message, history):
    """Stream a response from the IBM Granite model, running on ZeroGPU."""
    # Rebuild the conversation history as a list of chat messages
    chat = []
    for user_msg, assistant_msg in history:
        chat.append({"role": "user", "content": user_msg})
        if assistant_msg:
            chat.append({"role": "assistant", "content": assistant_msg})

    # Add the current message
    chat.append({"role": "user", "content": message})

    # Apply the model's chat template
    formatted_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize the text, truncating long conversations to 2048 tokens
    input_tokens = tokenizer(
        formatted_chat,
        return_tensors="pt",
        truncation=True,
        max_length=2048
    ).to('cuda')

    # Set up streaming generation
    from transformers import TextIteratorStreamer
    from threading import Thread

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # Generation kwargs
    generation_kwargs = dict(
        **input_tokens,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Run generation in a separate thread so tokens can be yielded as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the growing response back to the UI
    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    thread.join()

# Create the Gradio interface
with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px;">
            <h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🪨 IBM Granite 4.0 Chat</h1>
            <p style="font-size: 1.1em; color: #666; margin-bottom: 1em;">
                Chat with the IBM Granite 4.0-h Small model, powered by ZeroGPU
            </p>
            <p style="font-size: 0.9em; color: #888;">
                <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">
                    Built with anycoder
                </a>
            </p>
        </div>
        """
    )

    chatbot = gr.Chatbot(
        height=500,
        bubble_full_width=False,
        show_copy_button=True,
        layout="panel"
    )
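    # The chatbot stores history as [user_message, assistant_message] pairs,
    # which is the format generate_response and the handlers below expect.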

    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            lines=2,
            scale=9,
            autofocus=True
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Row():
        clear_btn = gr.ClearButton([msg, chatbot], value="🗑️ Clear Chat")

    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown("""
        ### Model Information
        - **Model**: IBM Granite 4.0-h Small
        - **Parameters**: Optimized for efficient inference
        - **Powered by**: Hugging Face ZeroGPU

        ### Tips for Better Responses
        - Be specific and clear in your questions
        - Provide context when needed
        - The model excels at various tasks including coding, analysis, and general conversation
        """)

    # Example prompts
    gr.Examples(
        examples=[
            "Explain quantum computing in simple terms",
            "Write a Python function to calculate factorial",
            "What are the main differences between machine learning and deep learning?",
            "Help me debug this code: def add(a, b) return a + b",
            "Create a healthy meal plan for a week",
            "Explain the concept of blockchain technology",
        ],
        inputs=msg,
        label="Example Prompts"
    )
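
    # Event handlers. Submitting a message (Enter in the textbox or the Send
    # button) runs a two-step chain: user_submit appends the new user turn to
    # the history, then bot_response streams the model's reply into that entry.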
    def user_submit(message, history):
        if not message.strip():
            return "", history
        return "", history + [[message, None]]

    def bot_response(history):
        if not history or history[-1][1] is not None:
            yield history
            return
        user_message = history[-1][0]
        history[-1][1] = ""
        for partial_response in generate_response(user_message, history[:-1]):
            history[-1][1] = partial_response
            yield history

    # Connect events
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )

    # Add footer
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 30px; padding: 20px; border-top: 1px solid #e0e0e0;">
            <p style="color: #666; font-size: 0.9em;">
                This application uses the IBM Granite 4.0-h Small model for generating responses.
                <br>Responses are generated using AI and should be verified for accuracy.
            </p>
        </div>
        """
    )

# Launch the application
if __name__ == "__main__":
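    # queue() enables request queuing, which the streaming (generator) handlers rely on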
    demo.queue()
    demo.launch(
        show_api=False,
        share=False
    )