""" Voice Agent UI - Autonomous voice-controlled agent """ import gradio as gr import asyncio from pathlib import Path from utils.audio_utils import speech_to_text, text_to_speech import time def create_voice_agent_ui(agent): """Create voice agent interface""" with gr.Row(): # Left column - Voice control with gr.Column(scale=1): gr.Markdown(""" ### 🎤 Voice Control Click the microphone button and speak your command. The agent will autonomously execute your request. """) # Audio input audio_input = gr.Audio( sources=["microphone"], type="filepath", label="Speak Your Command" ) # Manual text input as fallback text_input = gr.Textbox( label="Or Type Your Command", placeholder="Example: Extract deadlines from my PDFs and create calendar events", lines=3 ) # Execute button execute_btn = gr.Button( "🚀 Execute Command", variant="primary", size="lg" ) # Status indicator status_box = gr.Textbox( label="Status", value="Ready", interactive=False ) gr.Markdown("---") # Upload files for agent to process voice_file_upload = gr.File( label="Upload Files for Agent", file_count="multiple", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".docx", ".txt", ".csv"] ) uploaded_files_list = gr.Textbox( label="Available Files", placeholder="No files uploaded", interactive=False, lines=4 ) # Right column - Agent execution trace with gr.Column(scale=2): gr.Markdown("### 🤖 Agent Thoughts & Execution") # Chat-like interface for agent thoughts thought_trace = gr.Chatbot( label="Agent Reasoning", height=400, type="messages" ) # Final response final_response = gr.Textbox( label="Final Response", lines=6, placeholder="Agent's final answer will appear here..." ) # Audio output audio_output = gr.Audio( label="Voice Response", type="filepath", autoplay=True ) # Download outputs with gr.Accordion("đŸ“Ĩ Generated Files", open=False): outputs_files = gr.File( label="Download Generated Files", file_count="multiple" ) # State variables uploaded_files_state = gr.State([]) # Example commands with gr.Row(): gr.Markdown(""" ### 💡 Example Commands Try these voice commands: - "Extract all deadlines from my PDFs and add them to my calendar" - "Summarize this document and send me a professional email summary" - "Organize my uploaded files by type" - "Find all documents mentioning invoices and extract amounts" - "Create a calendar event for tomorrow at 2 PM titled Team Meeting" - "Draft a friendly email to John about the project update" """) # Event handlers async def handle_voice_file_upload(files): """Handle file uploads for voice agent""" if not files: return "No files uploaded", [] file_list = [] file_info_text = [] for file in files: from utils.file_utils import copy_file, get_file_info dest_path = f"data/uploads/{Path(file.name).name}" copy_file(file.name, dest_path) info = get_file_info(dest_path) file_list.append(dest_path) file_info_text.append(f"✓ {info['name']} ({info['size_mb']} MB)") # Add to RAG await agent.process_files_to_rag([{'path': dest_path, 'name': info['name']}]) return "\n".join(file_info_text), file_list async def process_audio_command(audio_file, text_command, files_list): """Process voice or text command""" # Determine input if audio_file and not text_command: # Transcribe audio yield [], "🎤 Transcribing audio...", "", None, None command_text = await speech_to_text(audio_file) if not command_text: yield [], "❌ Failed to transcribe audio", "", None, None return yield [], f"✓ Transcribed: {command_text}", "", None, None await asyncio.sleep(0.5) elif text_command: command_text = text_command else: yield [], "âš ī¸ Please provide a voice or text command", "", None, None return # Update status yield [], f"🤖 Planning: {command_text}", "", None, None # Execute with agent thoughts_display = [] final_answer = "" try: # Stream agent execution async for thought in agent.execute(command_text, files_list, stream_thoughts=True): if thought: # Format thought for display thought_msg = format_thought_message(thought) thoughts_display.append(thought_msg) # Update UI status = get_status_from_thought(thought) yield thoughts_display, status, "", None, None await asyncio.sleep(0.1) # Small delay for UI update # Get final answer final_answer, all_thoughts = await agent.execute(command_text, files_list, stream_thoughts=False) # Generate voice response yield thoughts_display, "🔊 Generating voice response...", final_answer, None, None if final_answer: audio_path = await text_to_speech(final_answer) # Collect generated files output_files = collect_output_files() yield thoughts_display, "✓ Complete!", final_answer, audio_path, output_files else: yield thoughts_display, "✓ Complete!", "Task executed successfully.", None, None except Exception as e: error_msg = f"❌ Error: {str(e)}" yield thoughts_display, error_msg, error_msg, None, None def format_thought_message(thought): """Format thought as chat message""" thought_type = thought.type content = thought.content # Choose role and styling based on thought type if thought_type == 'planning': role = "assistant" icon = "🧠" metadata = {"title": "🧠 Planning"} elif thought_type == 'tool_call': role = "assistant" icon = "🔧" tool_name = thought.tool_name or "unknown" metadata = {"title": f"🔧 Using Tool: {tool_name}"} elif thought_type == 'reflection': role = "assistant" icon = "💭" metadata = {"title": "💭 Reflecting"} elif thought_type == 'answer': role = "assistant" icon = "✅" metadata = {"title": "✅ Final Answer"} else: role = "assistant" icon = "â„šī¸" metadata = {"title": "â„šī¸ Info"} return { "role": role, "content": f"{icon} {content}", "metadata": metadata } def get_status_from_thought(thought): """Get status message from thought""" if thought.type == 'planning': return "🧠 Planning execution..." elif thought.type == 'tool_call': return f"🔧 Executing: {thought.tool_name or 'tool'}..." elif thought.type == 'reflection': return "💭 Analyzing results..." elif thought.type == 'answer': return "✅ Complete!" else: return "🤖 Processing..." def collect_output_files(): """Collect generated output files""" output_dir = Path("data/outputs") if not output_dir.exists(): return None # Get recent files (last 5 minutes) recent_files = [] cutoff_time = time.time() - 300 for file_path in output_dir.glob("*"): if file_path.is_file() and file_path.stat().st_mtime > cutoff_time: recent_files.append(str(file_path)) return recent_files if recent_files else None # Wire up events voice_file_upload.change( fn=handle_voice_file_upload, inputs=[voice_file_upload], outputs=[uploaded_files_list, uploaded_files_state] ) execute_btn.click( fn=process_audio_command, inputs=[audio_input, text_input, uploaded_files_state], outputs=[thought_trace, status_box, final_response, audio_output, outputs_files] ) # Quick action buttons gr.Markdown("### ⚡ Quick Actions") with gr.Row(): quick_summarize = gr.Button("📝 Summarize All Documents", size="sm") quick_calendar = gr.Button("📅 Extract & Create Events", size="sm") quick_organize = gr.Button("đŸ—‚ī¸ Organize Files", size="sm") quick_search = gr.Button("🔍 Search Documents", size="sm") async def quick_action(action_text, files_list): """Execute quick action""" async for update in process_audio_command(None, action_text, files_list): yield update quick_summarize.click( fn=lambda f: quick_action("Summarize all my uploaded documents", f), inputs=[uploaded_files_state], outputs=[thought_trace, status_box, final_response, audio_output, outputs_files] ) quick_calendar.click( fn=lambda f: quick_action("Extract all dates and deadlines from my documents and create calendar events", f), inputs=[uploaded_files_state], outputs=[thought_trace, status_box, final_response, audio_output, outputs_files] ) quick_organize.click( fn=lambda f: quick_action("Organize all my files by type", f), inputs=[uploaded_files_state], outputs=[thought_trace, status_box, final_response, audio_output, outputs_files] ) quick_search.click( fn=lambda f: quick_action("Search my documents for important information and summarize findings", f), inputs=[uploaded_files_state], outputs=[thought_trace, status_box, final_response, audio_output, outputs_files] )