Update app.py
app.py CHANGED
@@ -135,50 +135,79 @@ class DuckDuckGoSearchRun(Tool):
         return answer
 
 # Function to handle different input types and choose the right tool
-def handle_input(user_prompt, image=None, audio=None,
-    # Initialize the
-
+def handle_input(user_prompt, image=None, audio=None, websearch=False, document=None):
+    # Initialize the search tool
+    search = DuckDuckGoSearchRun()
 
-    # Initialize tools
     tools = [
-
-
-
+        Tool(
+            name="Search",
+            func=search.run,
+            description="Useful for searching the internet for general information"
+        ),
+        Tool(
+            name="Image",
+            func=ImageGeneration()._run,
+            description="Useful for generating images based on text descriptions"
+        ),
     ]
 
+    # Add the numpy tool, but with a more specific description
+    tools.append(Tool(
+        name="Numpy",
+        func=NumpyCodeCalculator()._run,
+        description="Useful only for performing numerical computations, not for general searches"
+    ))
+
     # Add the web search tool only if websearch mode is enabled
     if websearch:
-        tools.append(
+        tools.append(Tool(
+            name="Web",
+            func=WebSearch()._run,
+            description="Useful for advanced web searching beyond general information"
+        ))
 
     # Add the document question answering tool only if a document is provided
     if document:
-        tools.append(
-
-
-
-
-        user_prompt = "Whisper transcription of audio" # Replace with actual transcription
+        tools.append(Tool(
+            name="Document",
+            func=DocumentQuestionAnswering(document)._run,
+            description="Useful for answering questions about a specific document"
+        ))
 
-
-    if image and user_prompt:
-        image = Image.open(image).convert('RGB')
-        messages = [{"role": "user", "content": [image, user_prompt]}]
-        response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
-        return response
+    llm = ChatGroq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
 
     # Check if the input requires any tools
-    requires_tool =
-
-
-
+    requires_tool = False
+    for tool in tools:
+        if tool.name.lower() in user_prompt.lower():
+            requires_tool = True
+            break
+
+    if image or audio or requires_tool:
+        # Initialize the agent
         agent = initialize_agent(
             tools,
             llm,
             agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
             verbose=True
         )
-
+
+        if image:
+            image = Image.open(image).convert('RGB')
+            messages = [{"role": "user", "content": [image, user_prompt]}]
+            response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+        elif audio:
+            transcription = client.audio.transcriptions.create(
+                file=(audio.name, audio.read()),
+                model="whisper-large-v3"
+            )
+            user_prompt = transcription.text
+            response = agent.run(user_prompt)
+        else:
+            response = agent.run(user_prompt)
     else:
+        # If no tools are required, use the LLM directly
         response = llm.call(query=user_prompt)
 
     return response
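
The routing change above gates the agent behind a literal name match: the agent path is taken only when a tool's name appears as a substring of the prompt, or an image or audio input is present. A minimal, self-contained sketch of that heuristic (the namedtuple is a stand-in for the LangChain Tool objects; the tool names are the ones registered in the diff):

    from collections import namedtuple

    # Stand-in for the LangChain Tool objects built in handle_input
    Tool = namedtuple("Tool", ["name"])
    tools = [Tool("Search"), Tool("Image"), Tool("Numpy"), Tool("Web"), Tool("Document")]

    def requires_tool(user_prompt: str) -> bool:
        # Same case-insensitive substring check as the diff
        return any(tool.name.lower() in user_prompt.lower() for tool in tools)

    print(requires_tool("search for Groq benchmarks"))  # True: "search" matches
    print(requires_tool("please draw me a cat"))        # False: no tool name appears

Note that the second prompt falls through to the plain llm.call path even though the Image tool could serve it; the match is lexical, not semantic.
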
@@ -394,6 +423,7 @@ def create_ui():
 
     return demo
 
+# Main interface function
 @spaces.GPU(duration=180)
 def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
     print("Starting main_interface function")
@@ -404,7 +434,7 @@ def main_interface(user_prompt, image=None, audio=None, voice_only=False, websea
     print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}, document: {document}")
 
     try:
-        response = handle_input(user_prompt, image=image, audio=audio,
+        response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch, document=document)
         print("handle_input function executed successfully")
     except Exception as e:
         print(f"Error in handle_input: {e}")
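
One thing this hunk's context does not show is what happens after the except branch: if it only logs, `response` is unbound when the function continues. A hypothetical hardening of the call site, with a stub standing in for the real handle_input (the fallback message is illustrative, not from the diff):

    def handle_input(user_prompt, image=None, audio=None, websearch=False, document=None):
        raise RuntimeError("stub failure")  # stand-in for the real handler

    user_prompt = "hello"
    try:
        response = handle_input(user_prompt, websearch=False, document=None)
        print("handle_input function executed successfully")
    except Exception as e:
        print(f"Error in handle_input: {e}")
        response = f"Sorry, something went wrong: {e}"  # keep response bound

    print(response)
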
@@ -412,6 +442,12 @@ def main_interface(user_prompt, image=None, audio=None, voice_only=False, websea
 
     if voice_only:
         try:
+            transcription = client.audio.transcriptions.create(
+                file=("input.wav", open("input.wav", "rb").read()),
+                model="whisper-large-v3"
+            )
+            user_prompt = transcription.text
+            response = handle_input(user_prompt)
             audio_output = play_voice_output(response)
             print("play_voice_output function executed successfully")
             return "Response generated.", audio_output