""" Voice Agent UI - Autonomous voice-controlled agent COMPLETE FIXED VERSION """ import gradio as gr import asyncio from pathlib import Path from utils.audio_utils import speech_to_text, text_to_speech import time def create_voice_agent_ui(agent): """Create voice agent interface""" with gr.Row(): # LEFT COLUMN — INPUTS with gr.Column(scale=1): gr.Markdown(""" ### 🎤 Voice Control **How to use:** 1. Upload files (optional) 2. Speak OR type your command 3. Click Execute 4. Watch agent work! **Example commands:** - "Extract text from my PDF" - "Summarize this document" - "Organize my files" """) # Audio input audio_input = gr.Audio( sources=["microphone"], type="filepath", label="đŸŽ™ī¸ Speak Your Command" ) # Manual command input text_input = gr.Textbox( label="âŒ¨ī¸ Or Type Command", placeholder="Example: Extract deadlines from all PDFs", lines=3 ) # Execute button execute_btn = gr.Button( "🚀 Execute Command", variant="primary", size="lg" ) # Status status_box = gr.Textbox( label="📊 Status", value="Ready to execute commands", interactive=False ) gr.Markdown("---") # File uploader voice_file_upload = gr.Files( label="📁 Upload Files for Agent", file_count="multiple", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".docx", ".txt", ".csv"] ) uploaded_files_list = gr.Textbox( label="📂 Available Files", placeholder="No files uploaded yet", lines=4, interactive=False ) # RIGHT COLUMN — AGENT EXECUTION with gr.Column(scale=2): gr.Markdown("### 🤖 Agent Execution & Results") # Agent Reasoning Trace (Simple Chatbot - minimal parameters) thought_trace = gr.Chatbot( label="🧠 Agent Reasoning Steps", height=400 ) # Final Response final_response = gr.Textbox( label="✅ Final Answer", lines=6 ) # Voice Output audio_output = gr.Audio( label="🔊 Voice Response (if available)", type="filepath", autoplay=True ) # Generated Files with gr.Accordion("đŸ“Ĩ Generated Files", open=False): outputs_files = gr.Files( label="Download Outputs", file_count="multiple" ) # STATE: store uploaded files uploaded_files_state = gr.State([]) # FILE UPLOAD HANDLER async def handle_voice_file_upload(files): """Handle file uploads""" if not files: return "No files uploaded", [] file_paths = [] file_info_text = [] from utils.file_utils import copy_file, get_file_info import os for file in files: filename = os.path.basename(file) dest_path = f"data/uploads/{filename}" copy_file(file, dest_path) info = get_file_info(dest_path) file_paths.append(dest_path) file_info_text.append(f"â€ĸ {info['name']} ({info['size_mb']} MB)") # Add to RAG try: await agent.process_files_to_rag([{"path": dest_path, "name": info['name']}]) except Exception: pass return "\n".join(file_info_text), file_paths # MAIN COMMAND PROCESSOR async def process_audio_command(audio_file, text_command, files_list): """Process voice + text commands - COMPLETE FIXED VERSION""" # Step 1 — Identify user command if audio_file and not text_command: # Transcribe yield [], "🎤 Transcribing audio...", "", None, None cmd = await speech_to_text(audio_file) if not cmd: yield [], "âš ī¸ Failed to transcribe audio", "", None, None return else: yield [], f"🎤 Transcribed: {cmd}", "", None, None elif text_command: cmd = text_command else: yield [], "âš ī¸ Please provide a voice or text command", "", None, None return # Show planning state yield [], "🧠 Agent is planning...", "", None, None try: # Call agent (non-streaming) final_answer, thoughts = await agent.execute(cmd, files_list) # Convert AgentThought objects to Gradio 6.0 format # MUST be list of dicts with "role" and "content" keys 
            messages = []
            for t in thoughts:
                # Handle both AgentThought objects and plain dicts
                if hasattr(t, "type"):
                    t_type = t.type
                    t_content = t.content
                    t_tool = getattr(t, "tool_name", None)
                elif isinstance(t, dict):
                    t_type = t.get("type", "info")
                    t_content = t.get("content", "")
                    t_tool = t.get("tool_name")
                else:
                    t_type = "info"
                    t_content = str(t)
                    t_tool = None

                # Format the message with an icon and title
                icon = "ℹ️"
                title = "Info"
                if t_type == "planning":
                    icon = "🧠"
                    title = "Planning"
                elif t_type == "tool_call":
                    icon = "🔧"
                    title = f"Tool: {t_tool}" if t_tool else "Tool Call"
                elif t_type == "reflection":
                    icon = "💭"
                    title = "Reflection"
                elif t_type == "answer":
                    icon = "✅"
                    title = "Answer"

                # Add as an assistant message
                messages.append({
                    "role": "assistant",
                    "content": f"{icon} **{title}**\n\n{t_content}"
                })

            # Show results
            yield messages, "📊 Processing complete...", final_answer, None, None

            # TTS (optional; may fail if no API key is configured)
            audio_path = None
            try:
                audio_path = await text_to_speech(final_answer)
                # Only yield audio if it is a valid file path, not a directory
                if audio_path and Path(audio_path).is_file():
                    yield messages, "✅ Complete!", final_answer, audio_path, None
                else:
                    audio_path = None
            except Exception as e:
                print(f"TTS error (non-critical): {e}")
                audio_path = None

            # Collect recent outputs (files only, not directories)
            output_dir = Path("data/outputs")
            files_generated = []
            if output_dir.exists():
                cutoff = time.time() - 300  # last 5 minutes
                try:
                    for f in output_dir.glob("*"):
                        if f.is_file() and f.stat().st_mtime > cutoff:
                            files_generated.append(str(f))
                except Exception:
                    pass

            yield messages, "✅ Complete!", final_answer, audio_path, files_generated

        except Exception as e:
            import traceback
            err_msg = f"⚠️ Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"

            # Error message in the same messages format
            error_messages = [{
                "role": "assistant",
                "content": f"❌ **Error**\n\n{str(e)}"
            }]
            yield error_messages, f"❌ Error: {str(e)}", err_msg, None, None

    # CONNECT EVENTS
    def handle_voice_file_upload_sync(files):
        """Synchronous wrapper so the async upload handler can run via asyncio.run()."""
        return asyncio.run(handle_voice_file_upload(files))

    voice_file_upload.change(
        fn=handle_voice_file_upload_sync,
        inputs=[voice_file_upload],
        outputs=[uploaded_files_list, uploaded_files_state]
    )

    execute_btn.click(
        fn=process_audio_command,
        inputs=[audio_input, text_input, uploaded_files_state],
        outputs=[thought_trace, status_box, final_response, audio_output, outputs_files]
    )

    return gr.Column()
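

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the app): one way this UI could
# be mounted for a quick local smoke test. The _StubAgent below is a
# hypothetical stand-in that implements the two methods this module relies on,
# execute() and process_files_to_rag(), with canned responses; in the real app
# the caller of create_voice_agent_ui() supplies the actual agent object.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _StubAgent:
        """Minimal fake agent used only to exercise the UI manually."""

        async def execute(self, command, files):
            thoughts = [
                {"type": "planning", "content": f"Plan a response to: {command}"},
                {"type": "answer", "content": "This is a stubbed answer."},
            ]
            return "This is a stubbed answer.", thoughts

        async def process_files_to_rag(self, files):
            # No-op: the stub does not index anything.
            return None

    with gr.Blocks(title="Voice Agent (stub)") as demo:
        create_voice_agent_ui(_StubAgent())

    demo.launch()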