akhaliq HF Staff committed on
Commit 18575e5 · verified · 1 Parent(s): 07c769e

Update Gradio app with multiple files

Files changed (1)
  1. app.py +37 -25
app.py CHANGED
@@ -20,7 +20,7 @@ model.eval()
 
 @spaces.GPU(duration=60)
 def generate_response(message, history):
-    """Generate response using IBM Granite model with ZeroGPU."""
+    """Generate response using IBM Granite model with ZeroGPU with streaming."""
 
     # Format the conversation history
     chat = []
@@ -49,30 +49,39 @@ def generate_response(message, history):
         max_length=2048
     ).to('cuda')
 
-    # Generate output tokens
-    with torch.no_grad():
-        output = model.generate(
-            **input_tokens,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.95,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id
-        )
-
-    # Decode output tokens into text
-    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
-
-    # Extract only the assistant's response
-    # Remove the input prompt from the generated text
-    response = generated_text[len(formatted_chat):].strip()
-
-    # Clean up the response if needed
-    if response.startswith("assistant"):
-        response = response[len("assistant"):].strip()
-
-    return response
+    # Setup for streaming generation
+    from transformers import TextIteratorStreamer
+    from threading import Thread
+
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+
+    # Generation kwargs
+    generation_kwargs = dict(
+        **input_tokens,
+        max_new_tokens=512,
+        temperature=0.7,
+        top_p=0.95,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        streamer=streamer
+    )
+
+    # Start generation in a separate thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Stream the response
+    response = ""
+    for new_text in streamer:
+        response += new_text
+        yield response
+
+    thread.join()
 
 # Create the Gradio interface
 with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
@@ -147,12 +156,15 @@ with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
 
     def bot_response(history):
         if not history or history[-1][1] is not None:
-            return history
+            yield history
+            return
 
         user_message = history[-1][0]
-        bot_message = generate_response(user_message, history[:-1])
-        history[-1][1] = bot_message
-        return history
+        history[-1][1] = ""
+
+        for partial_response in generate_response(user_message, history[:-1]):
+            history[-1][1] = partial_response
+            yield history
 
     # Connect events
     msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
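The change replaces the blocking generate-and-decode path with token streaming: transformers' TextIteratorStreamer is fed by model.generate() running in a worker thread, and generate_response becomes a generator that yields the text accumulated so far. Below is a minimal, self-contained sketch of the same pattern; "gpt2" is only a stand-in checkpoint so the sketch runs on CPU, whereas the Space streams from the Granite model it has already loaded onto CUDA.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Stand-in checkpoint for illustration only; the Space uses IBM Granite on CUDA.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Explain token streaming in one sentence.", return_tensors="pt")

# skip_prompt drops the echoed input tokens; skip_special_tokens strips e.g. EOS.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs in a worker thread while the
# main thread consumes decoded text chunks from the streamer as they arrive.
thread = Thread(
    target=model.generate,
    kwargs=dict(
        **inputs,
        max_new_tokens=40,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    ),
)
thread.start()

response = ""
for new_text in streamer:
    response += new_text
    print(response)  # the app yields this growing partial string to Gradio instead

thread.join()

Yielding the running response rather than each individual chunk means every update carries the full message so far, which is what the Chatbot update in the next hunk expects.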
 
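On the UI side, bot_response becomes a generator too: it seeds an empty assistant slot and yields the updated history for each partial response, and Gradio streams every yield into the Chatbot. The diff cuts off the msg.submit(...).then(...) chain, so here is a minimal sketch of the full wiring, assuming the tuple-style [user, bot] history this app manipulates and a dummy word-by-word stream in place of generate_response:

import time

import gradio as gr

def user_submit(message, history):
    # Queue the user turn with an empty bot slot and clear the textbox.
    return "", history + [[message, None]]

def bot_response(history):
    if not history or history[-1][1] is not None:
        yield history
        return
    history[-1][1] = ""
    # Dummy stream standing in for generate_response(...) partial outputs.
    for word in "this reply arrives one chunk at a time".split():
        history[-1][1] += word + " "
        time.sleep(0.1)
        yield history  # each yield repaints the Chatbot with the partial reply

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    # queue=False lets the textbox clear immediately; the chained generator
    # then streams the growing history into the Chatbot.
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )

demo.queue().launch()

Because the handler mutates and re-yields the same history list, the last assistant message grows in place instead of appending new turns; the queue (enabled by demo.queue()) is what allows generator callbacks to stream.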