# Source: Hugging Face Space
#   "Lesterchia174/fpoce_Multilingual_Translator_with_Speech_Support"
# Space status when captured: Sleeping
# File size: 12,962 Bytes

import gradio as gr
import os
import tempfile
import whisper
import re
from groq import Groq
from gtts import gTTS

# Load the local Whisper "small" checkpoint once at import time; the same
# model object is used by transcribe_audio_locally for speech-to-text.
whisper_model = whisper.load_model("small")

# Groq client used for the chat-completions calls in translate_text.
# The key is read from the GROQ_API_KEY environment variable; os.getenv
# returns None when that variable is unset.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Display name -> two-letter language code handed to gTTS.
# Malaysian Malay and Indonesian Malay are deliberately separate entries.
LANGUAGE_CODES = {
    "English": "en",
    "Chinese": "zh",
    "Thai": "th",
    "Malaysian Malay": "ms",   # Bahasa Malaysia
    "Indonesian Malay": "id",  # Bahasa Indonesia
    "Korean": "ko",
    "Japanese": "ja",
    "Spanish": "es",
    "German": "de",
    "Hindi": "hi",
    "Urdu": "ur",
    "French": "fr",
    "Russian": "ru",
    "Tagalog": "tl",
    "Arabic": "ar",
    "Myanmar": "my",
    "Vietnamese": "vi",
    "Khmer": "km",
}

# Language names offered in the UI, in the mapping's insertion order, so the
# list of names and the code table cannot drift apart.
SUPPORTED_LANGUAGES = list(LANGUAGE_CODES)

# Available LLM models: UI display name -> Groq model ID.
# translate_text looks the selected display name up here and falls back to
# "qwen/qwen3-32b" for unknown names.
# NOTE(review): "meta-llama/llama-guard-4-12b" looks like a content-safety
# classifier rather than a general chat model — confirm it can actually
# produce translations before offering it in the model dropdown.
AVAILABLE_MODELS = {
    "Qwen3 32B": "qwen/qwen3-32b",
    "kimi-k2": "moonshotai/kimi-k2-instruct-0905",
    "Llama-3.3 70B": "llama-3.3-70b-versatile",
    "Llama-3.1 instant 8B": "llama-3.1-8b-instant",
    "Llama-4 guard 12B": "meta-llama/llama-guard-4-12b"
}

def transcribe_audio_locally(audio):
    """Run the module-level Whisper model on an audio file.

    Args:
        audio: Path of the audio file to transcribe, or ``None``.

    Returns:
        The transcript text; ``""`` when ``audio`` is ``None``; or an
        ``"Error transcribing audio: ..."`` message if transcription raises.
    """
    if audio is None:
        return ""

    try:
        transcript = whisper_model.transcribe(audio)["text"]
    except Exception as exc:
        # Failures are reported to the caller as text rather than raised.
        print(f"Error transcribing audio locally: {exc}")
        return f"Error transcribing audio: {exc}"
    return transcript

# Lines that open with typical chain-of-thought phrasing. Compiled once at
# import time instead of rebuilding six patterns on every call.
_REASONING_LINE_RE = re.compile(
    r"^\s*(?:Let me think|I need to|First,|Okay, so|Hmm,|Let's break this down).*$",
    re.MULTILINE,
)


def translate_text(input_text, input_lang, output_langs, model_name):
    """Translate text into one or more languages via Groq's chat API.

    Args:
        input_text: Text to translate.
        input_lang: Name of the source language (interpolated into the prompt).
        output_langs: Names of the target languages.
        model_name: Display name looked up in ``AVAILABLE_MODELS``; unknown
            names fall back to ``"qwen/qwen3-32b"``.

    Returns:
        The model's reply with reasoning artefacts removed. The model is asked
        for one ``<Language name>: <translation>`` line per target language.
        Returns ``""`` when ``input_text`` or ``output_langs`` is empty, and an
        ``"Error: ..."`` string if anything raises.
    """
    if not input_text or not output_langs:
        return ""

    try:
        # Get the actual model ID from our dictionary
        model_id = AVAILABLE_MODELS.get(model_name, "qwen/qwen3-32b")

        # The system prompt must agree with the user prompt below: both ask
        # for language-name prefixes, which is the format the downstream
        # parser looks for. Previously the system prompt forbade prefixes
        # while the user prompt demanded them.
        system_prompt = (
            "You are a translation assistant that provides direct, accurate translations. "
            "Do NOT include any thinking, reasoning, or explanations in your response. "
            "Output exactly one line per requested language, formatted as "
            "'<Language name>: <translation>'. "
            "Do NOT add any other prefixes or commentary. "
            "Do NOT use any special formatting like asterisks (**) or other markdown."
        )

        user_prompt = f"Translate this {input_lang} text: '{input_text}' into the following languages: {', '.join(output_langs)}. Provide each translation on a separate line with the language name as a prefix. Do not use any special formatting or markdown."

        response = groq_client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Treat a missing message body as an empty reply.
        translation_text = response.choices[0].message.content or ""

        # Remove any "thinking" blocks or bold markers that leaked through.
        translation_text = re.sub(r'<think>.*?</think>', '', translation_text, flags=re.DOTALL)
        translation_text = translation_text.replace('**', '')
        translation_text = _REASONING_LINE_RE.sub('', translation_text)

        # Strip last, so whitespace left behind by the removals above
        # (e.g. the newline after a <think> block) is trimmed as well.
        return translation_text.strip()
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing.
        print(f"Error translating text: {e}")
        return f"Error: {str(e)}"

def synthesize_speech(text, lang):
    """Generate an MP3 of ``text`` spoken in ``lang`` using gTTS.

    Args:
        text: Text to speak. Empty or whitespace-only text is skipped.
        lang: Language display name; looked up in ``LANGUAGE_CODES`` and
            falling back to ``"en"`` when unknown.

    Returns:
        Path of a newly created temporary ``.mp3`` file, or ``None`` when
        there is nothing to speak or synthesis fails. The caller owns the
        returned file.
    """
    if not text or not text.strip():
        return None

    mp3_path = None
    try:
        lang_code = LANGUAGE_CODES.get(lang, "en")
        tts = gTTS(text=text, lang=lang_code)

        # Reserve a unique file name, then close the handle *before* gTTS
        # writes to it: on Windows a file that is still open cannot be
        # reopened by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            mp3_path = fp.name

        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"Error synthesizing speech: {e}")
        # delete=False means nobody else will clean this up; remove the
        # empty/partial file so failed requests do not accumulate temp files.
        if mp3_path is not None:
            try:
                os.remove(mp3_path)
            except OSError:
                pass  # best-effort cleanup; the original error was logged above
        return None

def clear_all():
    """Return reset values: four empty strings followed by three ``None``s."""
    blank_texts = ["" for _ in range(4)]
    empty_audio = [None for _ in range(3)]
    return blank_texts + empty_audio

def process_speech_to_text(audio):
    """Transcribe ``audio`` and return the text; ``""`` when no audio is given."""
    return transcribe_audio_locally(audio) if audio else ""

def clean_translation_output(text):
    """Strip reasoning and markdown artefacts from raw model output.

    ``<think>...</think>`` blocks and all asterisks are removed. A line is
    then dropped when it starts with a reasoning phrase; otherwise it is kept
    if it contains a colon and a supported language name, or if it is
    non-blank and free of meta words such as "thinking" or "process".

    Args:
        text: Raw model output; may be empty or ``None``.

    Returns:
        The surviving lines joined by newlines (``""`` for falsy input).
    """
    if not text:
        return ""

    without_think = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    # Removing every single asterisk also removes every "**" pair.
    plain = without_think.replace('*', '')

    reasoning_start = re.compile(
        r'(^I need to|^Let me|^First|^Okay|^Hmm|^I will|^I am thinking|^I should)',
        re.IGNORECASE,
    )
    meta_word = re.compile(r'(thinking|translating|understand|process)', re.IGNORECASE)

    def is_kept(candidate):
        # Lines that open like reasoning are always discarded.
        if reasoning_start.search(candidate):
            return False
        # Language-labelled lines are kept as they are.
        if ':' in candidate:
            lowered = candidate.lower()
            if any(name.lower() in lowered for name in SUPPORTED_LANGUAGES):
                return True
        # Unlabelled lines survive only if non-blank and free of meta words.
        return bool(candidate.strip()) and not meta_word.search(candidate)

    return '\n'.join(piece for piece in plain.split('\n') if is_kept(piece))

def extract_translations(translations_text, output_langs):
    """Extract per-language translations from the model output.

    The text is cleaned with ``clean_translation_output`` and then searched
    for a ``<language>: <text>`` entry for each requested language. Results
    stay positionally aligned with ``output_langs``: a language whose label
    is missing yields ``""`` in its own slot, so later translations do not
    shift into the wrong slot (and get paired with the wrong language by the
    caller). Only when no label is found at all does the function fall back
    to taking the output line by line.

    Args:
        translations_text: Raw model output.
        output_langs: Requested target language names, in display order.

    Returns:
        A list of exactly three strings, padded with ``""``.
    """
    if not translations_text or not output_langs:
        return [""] * 3

    # Clean the translations text first
    clean_text = clean_translation_output(translations_text)

    # First try to find language-labelled translations, one slot per language.
    labeled = []
    found_label = False
    for lang in output_langs:
        # Capture lazily up to the next line that starts with a letter (taken
        # to be the next label) or the end of the text.
        pattern = rf'{re.escape(lang)}[\s]*:[\s]*(.*?)(?=\n\s*[A-Z]|$)'
        match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
        if match:
            found_label = True
            labeled.append(match.group(1).strip())
        else:
            # Keep the slot so results stay aligned with output_langs.
            labeled.append("")

    if found_label:
        translation_results = labeled
    elif '\n' in clean_text:
        # No labels at all: take the non-blank lines in order, dropping
        # anything before the first colon on a line.
        translation_results = []
        for line in clean_text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if ':' in line:
                translation_results.append(line.split(':', 1)[1].strip())
            else:
                translation_results.append(line)
    else:
        # Single unlabelled line: use the whole text.
        translation_results = [clean_text]

    # Ensure we have exactly 3 results
    translation_results += [""] * (3 - len(translation_results))
    return translation_results[:3]

def perform_translation(audio, typed_text, input_lang, output_langs, model_name):
    """Run the full pipeline: get input text, translate, synthesize speech.

    Args:
        audio: Path of recorded/uploaded audio, or ``None``. Only used when
            ``typed_text`` is empty.
        typed_text: Text typed by the user; takes precedence over ``audio``.
        input_lang: Name of the source language.
        output_langs: Selected target languages; only the first three are used.
        model_name: Display name of the translation model.

    Returns:
        A 7-item list: the input text, three translation strings, and three
        audio file paths (``None`` where no speech was produced). If
        transcription or translation fails, the error message is shown (in
        the input-text slot or the first translation slot respectively) and
        no speech is generated.
    """
    # Check if we have valid inputs
    if not output_langs:
        return [typed_text] + [""] * 3 + [None] * 3

    # Limit to 3 output languages
    selected_langs = output_langs[:3]

    # Get the input text either from typed text or by transcribing audio
    input_text = typed_text
    if not input_text and audio:
        input_text = transcribe_audio_locally(audio)
        # The transcriber reports failures as text; show the message instead
        # of sending it to the LLM to be "translated".
        if input_text.startswith("Error transcribing audio:"):
            return [input_text] + [""] * 3 + [None] * 3

    if not input_text:
        return [""] * 4 + [None] * 3

    # Get translations using the selected model
    translations_text = translate_text(input_text, input_lang, selected_langs, model_name)

    # translate_text reports failures as an "Error: ..." string. Surface it
    # in the first translation slot rather than parsing it as a translation
    # and reading it aloud.
    if translations_text.startswith("Error:"):
        return [input_text, translations_text, "", ""] + [None] * 3

    # Extract clean translations (always exactly three entries)
    translation_results = extract_translations(translations_text, selected_langs)

    # Generate speech for each non-empty translation; zip stops at the
    # number of selected languages, so pad up to three afterwards.
    audio_paths = [
        synthesize_speech(trans, lang) if trans and lang else None
        for trans, lang in zip(translation_results, selected_langs)
    ]
    audio_paths += [None] * (3 - len(audio_paths))

    # Return results in the expected format
    return [input_text] + translation_results + audio_paths

# Build the Gradio interface. Components are laid out in the order they are
# created inside the nested Row/Column contexts, so statement order matters.
with gr.Blocks(title="Multilingual Translator") as demo:
    gr.Markdown("## 🌍 Multilingual Translator with Speech Support")
    
    # Settings: source language, target languages, and translation model.
    with gr.Row():
        with gr.Column():
            input_lang = gr.Dropdown(
                choices=SUPPORTED_LANGUAGES, 
                value="English", 
                label="Input Language"
            )
            # The checkbox group is created without a selection cap; the
            # three-language limit is applied in validate_output_langs and
            # again in perform_translation.
            output_langs = gr.CheckboxGroup(
                choices=SUPPORTED_LANGUAGES, 
                label="Output Languages (select up to 3)"
            )
            model_selector = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()), 
                value="Qwen3 32B", 
                label="Translation Model"
            )
    
    # Inputs: microphone/upload audio (delivered as a file path) or typed text.
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath", 
                label="Speak Your Input (upload or record)"
            )
            text_input = gr.Textbox(
                label="Or Type Text", 
                placeholder="Enter text to translate here..."
            )
    
    # Read-only box that receives the first value returned by the
    # translation handler (the text that was actually translated).
    transcribed_text = gr.Textbox(
        label="Transcribed Text (from audio)", 
        interactive=False
    )
    
    # Three translation text boxes and three audio players, collected in
    # lists so they can be wired up as event outputs below.
    translated_outputs = []
    audio_outputs = []
    
    with gr.Row():
        for i in range(3):
            with gr.Column():
                translated_output = gr.Textbox(
                    label=f"Translation {i+1}", 
                    interactive=False,
                    visible=True  # always shown, even when fewer than 3 languages are selected
                )
                translated_outputs.append(translated_output)
                
                audio_output = gr.Audio(
                    label=f"Speech Output {i+1}",
                    visible=True  # always shown, even when fewer than 3 languages are selected
                )
                audio_outputs.append(audio_output)
    
    # Enforce the three-language limit at event time.
    def validate_output_langs(output_langs):
        """Return at most the first three selected languages, warning if more were chosen."""
        if len(output_langs) > 3:
            # Show warning but still allow processing with first 3
            gr.Warning("Please select only up to 3 languages. Using first 3 selected.")
            return output_langs[:3]
        return output_langs
    
    with gr.Row():
        translate_btn = gr.Button("Translate", variant="primary")
        clear_btn = gr.Button("Clear All")
    
    # Handle audio transcription
    def handle_audio_transcription(audio):
        """Transcribe new audio, or return "" when the audio was cleared."""
        if audio:
            return process_speech_to_text(audio)
        return ""
    
    # Whenever the audio changes, its transcript is written into the
    # editable "Or Type Text" box (not into transcribed_text), so the typed
    # text path is what the translation handler then uses.
    audio_input.change(
        handle_audio_transcription,
        inputs=[audio_input],
        outputs=[text_input]
    )
    
    # Handle translation with validation
    def handle_translation(audio, text, input_lang, output_langs, model):
        """Cap the language selection at three, then run perform_translation."""
        validated_langs = validate_output_langs(output_langs)
        return perform_translation(audio, text, input_lang, validated_langs, model)
    
    # The 7 values returned by the handler map, in order, onto:
    # transcribed_text, the 3 translation boxes, the 3 audio players.
    translate_btn.click(
        handle_translation,
        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )
    
    # Handle Enter key in text input (same wiring as the Translate button)
    text_input.submit(
        handle_translation,
        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )
    
    # Handle clear button
    # NOTE(review): this returns the same values as the module-level
    # clear_all(); one of the two could be dropped.
    def handle_clear():
        """Return blank values for the transcript, translations, and audio outputs."""
        return [""] * 4 + [None] * 3
    
    # NOTE(review): only the output components are reset here; text_input
    # and audio_input are not in the outputs list, so "Clear All" leaves the
    # user's input in place — confirm that is intended.
    clear_btn.click(
        handle_clear,
        inputs=[],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )

# Launch the application only when this file is run as a script.
# NOTE(review): share=True requests a public Gradio share link — confirm
# that is wanted for the environment this app is deployed to.
if __name__ == "__main__":
    demo.launch(share=True)