import spaces
import gradio as gr
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
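# NOTE: on ZeroGPU Spaces, `spaces` is imported before anything that could
# initialize CUDA so the runtime can patch torch; actual GPU time is only
# granted inside functions decorated with @spaces.GPU (see generate_response).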
# Model configuration
MODEL_PATH = "ibm-granite/granite-4.0-h-small"

# Load tokenizer (doesn't need GPU)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load model and move to GPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
model.to('cuda')
model.eval()
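# float16 halves the memory footprint relative to float32, and eval() disables
# dropout, so output varies only through the sampling settings used below.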
@spaces.GPU  # required on ZeroGPU: a GPU is attached only for the duration of this call
def generate_response(message, history):
    """Generate a streaming response from the IBM Granite model on ZeroGPU."""
    # Format the conversation history
    chat = []

    # Add conversation history
    for user_msg, assistant_msg in history:
        chat.append({"role": "user", "content": user_msg})
        if assistant_msg:
            chat.append({"role": "assistant", "content": assistant_msg})

    # Add current message
    chat.append({"role": "user", "content": message})

    # Apply chat template
    formatted_chat = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize the text
    input_tokens = tokenizer(
        formatted_chat,
        return_tensors="pt",
        truncation=True,
        max_length=2048
    ).to('cuda')
    # Setup for streaming generation
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )
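    # The streamer yields decoded text chunks as generate() produces tokens;
    # skip_prompt=True drops the echoed input so only new text is streamed.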
    # Generation kwargs
    generation_kwargs = dict(
        **input_tokens,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )
    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()
# Create the Gradio interface
with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px;">
            <h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🪨 IBM Granite 4.0 Chat</h1>
            <p style="font-size: 1.1em; color: #666; margin-bottom: 1em;">
                Chat with the IBM Granite 4.0-h Small model, powered by ZeroGPU
            </p>
            <p style="font-size: 0.9em; color: #888;">
                <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #007bff; text-decoration: none;">
                    Built with anycoder
                </a>
            </p>
        </div>
        """
    )

    # bubble_full_width is deprecated in recent Gradio releases and has no
    # effect, so it is omitted here.
    chatbot = gr.Chatbot(
        height=500,
        show_copy_button=True,
        layout="panel"
    )
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            lines=2,
            scale=9,
            autofocus=True
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Row():
        clear_btn = gr.ClearButton([msg, chatbot], value="🗑️ Clear Chat")
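    # gr.ClearButton resets the listed components directly in the browser,
    # so no Python callback is needed for clearing.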
    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown("""
        ### Model Information
        - **Model**: IBM Granite 4.0-h Small
        - **Parameters**: Optimized for efficient inference
        - **Powered by**: Hugging Face ZeroGPU

        ### Tips for Better Responses
        - Be specific and clear in your questions
        - Provide context when needed
        - The model handles a wide range of tasks, including coding, analysis, and general conversation
        """)
    # Example prompts
    gr.Examples(
        examples=[
            "Explain quantum computing in simple terms",
            "Write a Python function to calculate factorial",
            "What are the main differences between machine learning and deep learning?",
            "Help me debug this code: def add(a, b) return a + b",
            "Create a healthy meal plan for a week",
            "Explain the concept of blockchain technology",
        ],
        inputs=msg,
        label="Example Prompts"
    )
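    # The handlers below use Gradio's "tuples" chat history format: a list of
    # [user_message, assistant_message] pairs. Newer Gradio versions prefer
    # type="messages" on gr.Chatbot, which would require reshaping these
    # handlers into role/content dicts.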
    # Event handlers
    def user_submit(message, history):
        if not message.strip():
            return "", history
        return "", history + [[message, None]]

    def bot_response(history):
        if not history or history[-1][1] is not None:
            yield history
            return
        user_message = history[-1][0]
        history[-1][1] = ""
        for partial_response in generate_response(user_message, history[:-1]):
            history[-1][1] = partial_response
            yield history
    # Connect events
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
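    # queue=False lets the lightweight user_submit step run immediately; the
    # chained .then() call runs bot_response through the default queue, which
    # is what lets the streamed partial updates render in the chatbot.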
    # Add footer
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 30px; padding: 20px; border-top: 1px solid #e0e0e0;">
            <p style="color: #666; font-size: 0.9em;">
                This application uses the IBM Granite 4.0-h Small model to generate responses.
                <br>Responses are AI-generated and should be verified for accuracy.
            </p>
        </div>
        """
    )
# Launch the application
if __name__ == "__main__":
    demo.queue()
    demo.launch(
        show_api=False,
        share=False
    )
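# Assumed Space dependencies (a hypothetical requirements.txt, not part of the
# original file): gradio, torch, transformers, and accelerate (needed for
# low_cpu_mem_usage=True); the `spaces` package is provided on ZeroGPU
# hardware, so list it only if your environment lacks it.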