akhaliq HF Staff committed on
Commit 18575e5 · verified · 1 Parent(s): 07c769e

Update Gradio app with multiple files

Files changed (1)
  1. app.py +37 -25
app.py CHANGED
@@ -20,7 +20,7 @@ model.eval()
 
 @spaces.GPU(duration=60)
 def generate_response(message, history):
-    """Generate response using IBM Granite model with ZeroGPU."""
+    """Generate response using IBM Granite model with ZeroGPU with streaming."""
 
     # Format the conversation history
     chat = []
@@ -49,30 +49,39 @@ def generate_response(message, history):
         max_length=2048
     ).to('cuda')
 
-    # Generate output tokens
-    with torch.no_grad():
-        output = model.generate(
-            **input_tokens,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.95,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id
-        )
-
-    # Decode output tokens into text
-    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
-
-    # Extract only the assistant's response
-    # Remove the input prompt from the generated text
-    response = generated_text[len(formatted_chat):].strip()
-
-    # Clean up the response if needed
-    if response.startswith("assistant"):
-        response = response[len("assistant"):].strip()
-
-    return response
+    # Setup for streaming generation
+    from transformers import TextIteratorStreamer
+    from threading import Thread
+
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+
+    # Generation kwargs
+    generation_kwargs = dict(
+        **input_tokens,
+        max_new_tokens=512,
+        temperature=0.7,
+        top_p=0.95,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        streamer=streamer
+    )
+
+    # Start generation in a separate thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Stream the response
+    response = ""
+    for new_text in streamer:
+        response += new_text
+        yield response
+
+    thread.join()
 
 # Create the Gradio interface
 with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
@@ -147,12 +156,15 @@ with gr.Blocks(title="IBM Granite Chat", theme=gr.themes.Soft()) as demo:
 
     def bot_response(history):
         if not history or history[-1][1] is not None:
-            return history
+            yield history
+            return
 
         user_message = history[-1][0]
-        bot_message = generate_response(user_message, history[:-1])
-        history[-1][1] = bot_message
-        return history
+        history[-1][1] = ""
+
+        for partial_response in generate_response(user_message, history[:-1]):
+            history[-1][1] = partial_response
+            yield history
 
     # Connect events
     msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
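The change replaces the blocking generate-and-decode path with token streaming: transformers' TextIteratorStreamer is fed by model.generate() running in a worker thread, and generate_response becomes a generator that yields the text accumulated so far. Below is a minimal, self-contained sketch of the same pattern; "gpt2" is only a stand-in checkpoint so the sketch runs on CPU, whereas the Space streams from the Granite model it has already loaded onto CUDA.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Stand-in checkpoint for illustration only; the Space uses IBM Granite on CUDA.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Explain token streaming in one sentence.", return_tensors="pt")

# skip_prompt drops the echoed input tokens; skip_special_tokens strips e.g. EOS.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs in a worker thread while the
# main thread consumes decoded text chunks from the streamer as they arrive.
thread = Thread(
    target=model.generate,
    kwargs=dict(
        **inputs,
        max_new_tokens=40,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    ),
)
thread.start()

response = ""
for new_text in streamer:
    response += new_text
    print(response)  # the app yields this growing partial string to Gradio instead

thread.join()

Yielding the running response rather than each individual chunk means every update carries the full message so far, which is what the Chatbot update in the next hunk expects.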
 
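On the UI side, bot_response becomes a generator too: it seeds an empty assistant slot and yields the updated history for each partial response, and Gradio streams every yield into the Chatbot. The diff cuts off the msg.submit(...).then(...) chain, so here is a minimal sketch of the full wiring, assuming the tuple-style [user, bot] history this app manipulates and a dummy word-by-word stream in place of generate_response:

import time

import gradio as gr

def user_submit(message, history):
    # Queue the user turn with an empty bot slot and clear the textbox.
    return "", history + [[message, None]]

def bot_response(history):
    if not history or history[-1][1] is not None:
        yield history
        return
    history[-1][1] = ""
    # Dummy stream standing in for generate_response(...) partial outputs.
    for word in "this reply arrives one chunk at a time".split():
        history[-1][1] += word + " "
        time.sleep(0.1)
        yield history  # each yield repaints the Chatbot with the partial reply

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    # queue=False lets the textbox clear immediately; the chained generator
    # then streams the growing history into the Chatbot.
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )

demo.queue().launch()

Because the handler mutates and re-yields the same history list, the last assistant message grows in place instead of appending new turns; the queue (enabled by demo.queue()) is what allows generator callbacks to stream.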