Update app.py
app.py CHANGED
@@ -44,6 +44,7 @@ tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API"))
 
 # Function to play voice output
 def play_voice_output(response):
+    print("Executing play_voice_output function")
     description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to('cuda')
     prompt_input_ids = tts_tokenizer(response, return_tensors="pt").input_ids.to('cuda')
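(The two-step tokenization above — a voice description plus the response text — matches the Parler-TTS generation API. The checkpoint app.py actually loads is not visible in this diff, so the sketch below assumes the upstream parler_tts_mini checkpoint:

import soundfile as sf
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Assumed checkpoint; app.py's real TTS model is defined outside this hunk
model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to("cuda")
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

description = "Jon's voice is monotone yet slightly fast in delivery."
input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to("cuda")
prompt_input_ids = tts_tokenizer("Hello there!", return_tensors="pt").input_ids.to("cuda")

# The description conditions the voice; the prompt is the text to speak
generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
sf.write("out.wav", generation.cpu().numpy().squeeze(), model.config.sampling_rate)

)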
@@ -58,6 +59,7 @@ class NumpyCodeCalculator(Tool):
     description = "Useful only for performing numerical computations, not for general searches"
 
     def _run(self, query: str) -> str:
+        print("Executing NumpyCodeCalculator tool")
         try:
             local_dict = {"np": np}
             exec(query, local_dict)
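(For context on the exec pattern this tool relies on: the generated code runs with np pre-bound, and any variable it assigns lands in local_dict. A minimal sketch — the result variable name is an assumption, since the hunk cuts off before the return value is extracted:

import numpy as np

# Code the agent might send; it assigns its answer to a variable
query = "result = np.mean(np.array([1, 2, 3, 4]))"

local_dict = {"np": np}
exec(query, local_dict)          # executes the string with np available
print(local_dict.get("result"))  # 2.5

)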
@@ -72,6 +74,7 @@ class WebSearch(Tool):
     description = "Useful for advanced web searching beyond general information"
 
     def _run(self, query: str) -> str:
+        print("Executing WebSearch tool")
         answer = tavily_client.qna_search(query=query)
         return answer
 
@@ -81,6 +84,7 @@ class ImageGeneration(Tool):
     description = "Useful for generating images based on text descriptions"
 
     def _run(self, query: str) -> str:
+        print("Executing ImageGeneration tool")
         image = pipe(
             query,
             negative_prompt="",
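(The hunk truncates mid-call. For context, a typical diffusers text-to-image invocation with a negative_prompt looks like the sketch below; the checkpoint and remaining keyword arguments are assumptions, as app.py's pipeline setup is not part of this diff:

import torch
from diffusers import StableDiffusionPipeline

# Assumed pipeline; app.py's actual `pipe` is defined outside this hunk
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    "a watercolor fox in a misty forest",  # illustrative prompt
    negative_prompt="",
    num_inference_steps=25,
).images[0]
image.save("output.png")

)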
@@ -101,6 +105,7 @@ class DocumentQuestionAnswering(Tool):
         self.qa_chain = self._setup_qa_chain()
 
     def _setup_qa_chain(self):
+        print("Setting up DocumentQuestionAnswering tool")
         loader = TextLoader(self.document)
         documents = loader.load()
         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
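(This hunk also cuts off before the chain is assembled. In the legacy LangChain API that app.py's initialize_agent/AgentType imports imply, the loader/splitter lines above are usually completed roughly as follows; the embedding model, FAISS store, and Groq model name are assumptions:

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq

loader = TextLoader("notes.txt")  # hypothetical document path
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks and serve them through a retriever-backed QA chain
db = FAISS.from_documents(texts, HuggingFaceEmbeddings())
llm = ChatGroq(model="llama3-8b-8192")  # assumed model; app.py uses MODEL
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())

)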
@@ -116,77 +121,73 @@ class DocumentQuestionAnswering(Tool):
         return qa_chain
 
     def _run(self, query: str) -> str:
+        print("Executing DocumentQuestionAnswering tool")
         response = self.qa_chain.run(query)
         return str(response)
 
 
 # Function to handle different input types and choose the right tool
 def handle_input(user_prompt, image=None, audio=None, websearch=False, document=None):
+    print(f"Handling input: {user_prompt}")
 
-    tools = [
-        Tool(
-            name="Image",
-            func=ImageGeneration(),  # Pass the class instance, not ImageGeneration()._run
-            description="Useful for generating images based on text descriptions"
-        ),
-    ]
-
-    # Add the numpy tool, but with a more specific description
-    tools.append(Tool(
-        name="Calculator",
-        func=NumpyCodeCalculator(),  # Pass the class instance, not NumpyCodeCalculator()._run
-        description="Useful only for performing numerical computations, not for general searches"
-    ))
-
-    # Add the web search tool only if websearch mode is enabled
-    if websearch:
-        tools.append(Tool(
-            name="Web",
-            func=WebSearch(),  # Pass the class instance, not WebSearch()._run
-            description="Useful for advanced web searching beyond general information"
-        ))
+    # Initialize the LLM
+    llm = ChatGroq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
 
-    #
-
-    tools.append(Tool(
-        name="Document",
-        func=DocumentQuestionAnswering(document),  # This is already correct
-        description="Useful for answering questions about a specific document"
-    ))
+    # Define the tools
+    tools = []
 
-
+    # Add Image Generation Tool
+    tools.append(ImageGeneration())
+
+    # Add Calculator Tool
+    tools.append(NumpyCodeCalculator())
 
-    #
-    requires_tool = False
-    for tool in tools:
-        if tool.name.lower() in user_prompt.lower():
-            requires_tool = True
-            break
+    # Add Web Search Tool if enabled
+    if websearch:
+        tools.append(WebSearch())
 
-
-    if requires_tool:
+    # Add Document QA Tool if document is provided
+    if document:
+        tools.append(DocumentQuestionAnswering(document))
+
+    # Check if any tools are mentioned in the user prompt
+    requires_tool = any([tool.name.lower() in user_prompt.lower() for tool in tools])
+
+    # Handle different input scenarios
+    if image:
+        print("Processing image input")
+        image = Image.open(image).convert('RGB')
+        messages = [{"role": "user", "content": [image, user_prompt]}]
+        response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+    elif audio:
+        print("Processing audio input")
+        transcription = client.audio.transcriptions.create(
+            file=(audio.name, audio.read()),
+            model="whisper-large-v3"
+        )
+        user_prompt = transcription.text
+        # If tools are required, use an agent
+        if requires_tool:
+            agent = initialize_agent(
+                tools,
+                llm,
+                agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+                verbose=True
+            )
+            response = agent.run(user_prompt)
+        else:
+            response = llm.call(query=user_prompt)
+    elif requires_tool:
+        print("Using agent with tools")
         agent = initialize_agent(
             tools,
             llm,
             agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
             verbose=True
         )
-
-        if image:
-            image = Image.open(image).convert('RGB')
-            messages = [{"role": "user", "content": [image, user_prompt]}]
-            response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
-        elif audio:
-            transcription = client.audio.transcriptions.create(
-                file=(audio.name, audio.read()),
-                model="whisper-large-v3"
-            )
-            user_prompt = transcription.text
-            response = agent.run(user_prompt)
-        else:
-            response = agent.run(user_prompt)
+        response = agent.run(user_prompt)
     else:
-
+        print("Using LLM directly")
         response = llm.call(query=user_prompt)
 
     return response
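(The rewritten routing hinges on requires_tool, a plain substring match of each tool's name attribute against the prompt. A minimal self-contained sketch of that behaviour — the names mirror the tool classes, which is an assumption, since the name attributes are declared outside this diff:

class FakeTool:
    def __init__(self, name):
        self.name = name

tools = [FakeTool("Image"), FakeTool("Calculator"), FakeTool("Web")]

prompt = "Use the calculator to add 2 and 2"
requires_tool = any(tool.name.lower() in prompt.lower() for tool in tools)
print(requires_tool)  # True: "calculator" occurs in the prompt

Note the match is purely lexical: a prompt like "compute 2 + 2" would bypass the Calculator tool entirely.)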
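(Finally, a sketch of how the rewritten handle_input entry point might be exercised; the prompts and document path are illustrative, not from the source:

# No tool name in the prompt: falls through to llm.call directly
print(handle_input("Summarize the plot of Hamlet"))

# Mentions "web" and enables websearch: routed through the agent
print(handle_input("Search the web for today's weather in Paris", websearch=True))

# Document question answering over a provided file
print(handle_input("What does the introduction claim?", document="notes.txt"))

)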