Spaces:

MCP-1st-Birthday
/

LifeAdmin-AI

Running

File size: 11,377 Bytes

02476c0

"""
Voice Agent UI - Autonomous voice-controlled agent
"""

import gradio as gr
import asyncio
from pathlib import Path
from utils.audio_utils import speech_to_text, text_to_speech
import time


def create_voice_agent_ui(agent):
    """Create voice agent interface"""
    
    with gr.Row():
        # Left column - Voice control
        with gr.Column(scale=1):
            gr.Markdown("""
            ### 🎤 Voice Control
            
            Click the microphone button and speak your command.
            The agent will autonomously execute your request.
            """)
            
            # Audio input
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak Your Command"
            )
            
            # Manual text input as fallback
            text_input = gr.Textbox(
                label="Or Type Your Command",
                placeholder="Example: Extract deadlines from my PDFs and create calendar events",
                lines=3
            )
            
            # Execute button
            execute_btn = gr.Button(
                "🚀 Execute Command",
                variant="primary",
                size="lg"
            )
            
            # Status indicator
            status_box = gr.Textbox(
                label="Status",
                value="Ready",
                interactive=False
            )
            
            gr.Markdown("---")
            
            # Upload files for agent to process
            voice_file_upload = gr.File(
                label="Upload Files for Agent",
                file_count="multiple",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".docx", ".txt", ".csv"]
            )
            
            uploaded_files_list = gr.Textbox(
                label="Available Files",
                placeholder="No files uploaded",
                interactive=False,
                lines=4
            )
        
        # Right column - Agent execution trace
        with gr.Column(scale=2):
            gr.Markdown("### 🤖 Agent Thoughts & Execution")
            
            # Chat-like interface for agent thoughts
            thought_trace = gr.Chatbot(
                label="Agent Reasoning",
                height=400,
                type="messages"
            )
            
            # Final response
            final_response = gr.Textbox(
                label="Final Response",
                lines=6,
                placeholder="Agent's final answer will appear here..."
            )
            
            # Audio output
            audio_output = gr.Audio(
                label="Voice Response",
                type="filepath",
                autoplay=True
            )
            
            # Download outputs
            with gr.Accordion("📥 Generated Files", open=False):
                outputs_files = gr.File(
                    label="Download Generated Files",
                    file_count="multiple"
                )
    
    # State variables
    uploaded_files_state = gr.State([])
    
    # Example commands
    with gr.Row():
        gr.Markdown("""
        ### 💡 Example Commands
        
        Try these voice commands:
        - "Extract all deadlines from my PDFs and add them to my calendar"
        - "Summarize this document and send me a professional email summary"
        - "Organize my uploaded files by type"
        - "Find all documents mentioning invoices and extract amounts"
        - "Create a calendar event for tomorrow at 2 PM titled Team Meeting"
        - "Draft a friendly email to John about the project update"
        """)
    
    # Event handlers
    async def handle_voice_file_upload(files):
        """Handle file uploads for voice agent"""
        if not files:
            return "No files uploaded", []
        
        file_list = []
        file_info_text = []
        
        for file in files:
            from utils.file_utils import copy_file, get_file_info
            
            dest_path = f"data/uploads/{Path(file.name).name}"
            copy_file(file.name, dest_path)
            
            info = get_file_info(dest_path)
            file_list.append(dest_path)
            file_info_text.append(f"✓ {info['name']} ({info['size_mb']} MB)")
            
            # Add to RAG
            await agent.process_files_to_rag([{'path': dest_path, 'name': info['name']}])
        
        return "\n".join(file_info_text), file_list
    
    async def process_audio_command(audio_file, text_command, files_list):
        """Process voice or text command"""
        
        # Determine input
        if audio_file and not text_command:
            # Transcribe audio
            yield [], "🎤 Transcribing audio...", "", None, None
            command_text = await speech_to_text(audio_file)
            
            if not command_text:
                yield [], "❌ Failed to transcribe audio", "", None, None
                return
            
            yield [], f"✓ Transcribed: {command_text}", "", None, None
            await asyncio.sleep(0.5)
        
        elif text_command:
            command_text = text_command
        
        else:
            yield [], "⚠️ Please provide a voice or text command", "", None, None
            return
        
        # Update status
        yield [], f"🤖 Planning: {command_text}", "", None, None
        
        # Execute with agent
        thoughts_display = []
        final_answer = ""
        
        try:
            # Stream agent execution
            async for thought in agent.execute(command_text, files_list, stream_thoughts=True):
                if thought:
                    # Format thought for display
                    thought_msg = format_thought_message(thought)
                    thoughts_display.append(thought_msg)
                    
                    # Update UI
                    status = get_status_from_thought(thought)
                    yield thoughts_display, status, "", None, None
                    
                    await asyncio.sleep(0.1)  # Small delay for UI update
            
            # Get final answer
            final_answer, all_thoughts = await agent.execute(command_text, files_list, stream_thoughts=False)
            
            # Generate voice response
            yield thoughts_display, "🔊 Generating voice response...", final_answer, None, None
            
            if final_answer:
                audio_path = await text_to_speech(final_answer)
                
                # Collect generated files
                output_files = collect_output_files()
                
                yield thoughts_display, "✓ Complete!", final_answer, audio_path, output_files
            else:
                yield thoughts_display, "✓ Complete!", "Task executed successfully.", None, None
        
        except Exception as e:
            error_msg = f"❌ Error: {str(e)}"
            yield thoughts_display, error_msg, error_msg, None, None
    
    def format_thought_message(thought):
        """Format thought as chat message"""
        thought_type = thought.type
        content = thought.content
        
        # Choose role and styling based on thought type
        if thought_type == 'planning':
            role = "assistant"
            icon = "🧠"
            metadata = {"title": "🧠 Planning"}
        elif thought_type == 'tool_call':
            role = "assistant"
            icon = "🔧"
            tool_name = thought.tool_name or "unknown"
            metadata = {"title": f"🔧 Using Tool: {tool_name}"}
        elif thought_type == 'reflection':
            role = "assistant"
            icon = "💭"
            metadata = {"title": "💭 Reflecting"}
        elif thought_type == 'answer':
            role = "assistant"
            icon = "✅"
            metadata = {"title": "✅ Final Answer"}
        else:
            role = "assistant"
            icon = "ℹ️"
            metadata = {"title": "ℹ️ Info"}
        
        return {
            "role": role,
            "content": f"{icon} {content}",
            "metadata": metadata
        }
    
    def get_status_from_thought(thought):
        """Get status message from thought"""
        if thought.type == 'planning':
            return "🧠 Planning execution..."
        elif thought.type == 'tool_call':
            return f"🔧 Executing: {thought.tool_name or 'tool'}..."
        elif thought.type == 'reflection':
            return "💭 Analyzing results..."
        elif thought.type == 'answer':
            return "✅ Complete!"
        else:
            return "🤖 Processing..."
    
    def collect_output_files():
        """Collect generated output files"""
        output_dir = Path("data/outputs")
        if not output_dir.exists():
            return None
        
        # Get recent files (last 5 minutes)
        recent_files = []
        cutoff_time = time.time() - 300
        
        for file_path in output_dir.glob("*"):
            if file_path.is_file() and file_path.stat().st_mtime > cutoff_time:
                recent_files.append(str(file_path))
        
        return recent_files if recent_files else None
    
    # Wire up events
    voice_file_upload.change(
        fn=handle_voice_file_upload,
        inputs=[voice_file_upload],
        outputs=[uploaded_files_list, uploaded_files_state]
    )
    
    execute_btn.click(
        fn=process_audio_command,
        inputs=[audio_input, text_input, uploaded_files_state],
        outputs=[thought_trace, status_box, final_response, audio_output, outputs_files]
    )
    
    # Quick action buttons
    gr.Markdown("### ⚡ Quick Actions")
    
    with gr.Row():
        quick_summarize = gr.Button("📝 Summarize All Documents", size="sm")
        quick_calendar = gr.Button("📅 Extract & Create Events", size="sm")
        quick_organize = gr.Button("🗂️ Organize Files", size="sm")
        quick_search = gr.Button("🔍 Search Documents", size="sm")
    
    async def quick_action(action_text, files_list):
        """Execute quick action"""
        async for update in process_audio_command(None, action_text, files_list):
            yield update
    
    quick_summarize.click(
        fn=lambda f: quick_action("Summarize all my uploaded documents", f),
        inputs=[uploaded_files_state],
        outputs=[thought_trace, status_box, final_response, audio_output, outputs_files]
    )
    
    quick_calendar.click(
        fn=lambda f: quick_action("Extract all dates and deadlines from my documents and create calendar events", f),
        inputs=[uploaded_files_state],
        outputs=[thought_trace, status_box, final_response, audio_output, outputs_files]
    )
    
    quick_organize.click(
        fn=lambda f: quick_action("Organize all my files by type", f),
        inputs=[uploaded_files_state],
        outputs=[thought_trace, status_box, final_response, audio_output, outputs_files]
    )
    
    quick_search.click(
        fn=lambda f: quick_action("Search my documents for important information and summarize findings", f),
        inputs=[uploaded_files_state],
        outputs=[thought_trace, status_box, final_response, audio_output, outputs_files]
    )