Spaces:

Lesterchia174
/

fpoce_Multilingual_Translator_with_Speech_Support

Running

App Files Files Community

Lesterchia174 committed 13 days ago

Commit

f1dffb2

verified ·

1 Parent(s): 82c8222

Create app.py

Browse files

Files changed (1) hide show

app.py +351 -0

app.py ADDED Viewed

	@@ -0,0 +1,351 @@

+import gradio as gr
+import os
+import tempfile
+import whisper
+import re
+from groq import Groq
+from gtts import gTTS
+# Load the local Whisper "base" checkpoint once at import time; the same model
+# object is reused by transcribe_audio_locally() for every request.
+whisper_model = whisper.load_model("base")
+# Groq client used by translate_text(); the API key is read from the
+# GROQ_API_KEY environment variable.
+groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+# Language names offered in the UI (input dropdown and output checkbox group).
+# Malaysian Malay and Indonesian Malay are deliberately separate entries.
+SUPPORTED_LANGUAGES = [
+    "English", "Chinese", "Thai",
+    "Malaysian Malay", "Indonesian Malay",  # two distinct entries, not one "Malay"
+    "Korean", "Japanese", "Spanish", "German",
+    "Hindi", "Urdu", "French", "Russian",
+    "Tagalog", "Arabic", "Myanmar", "Vietnamese",
+    "Khmer"
+]
+# Maps each UI language name to the language code handed to gTTS in
+# synthesize_speech(); names missing from this table fall back to "en" there.
+LANGUAGE_CODES = {
+    "English": "en", "Chinese": "zh", "Thai": "th",
+    "Malaysian Malay": "ms",  # Bahasa Malaysia (ms)
+    "Indonesian Malay": "id",  # Bahasa Indonesia (id)
+    "Korean": "ko", "Japanese": "ja", "Spanish": "es",
+    "German": "de", "Hindi": "hi", "Urdu": "ur",
+    "French": "fr", "Russian": "ru", "Tagalog": "tl",
+    "Arabic": "ar", "Myanmar": "my", "Vietnamese": "vi",
+    "Khmer": "km"  # Khmer language code (km)
+}
+# Maps the label shown in the model dropdown to the model ID sent to Groq by
+# translate_text().
+# NOTE(review): "meta-llama/llama-guard-4-12b" looks like a content-safety
+# classifier rather than a general chat model; presumably it will not produce
+# translations -- confirm before keeping it in this list.
+AVAILABLE_MODELS = {
+    "Qwen3 32B": "qwen/qwen3-32b",
+    "kimi-k2": "moonshotai/kimi-k2-instruct-0905",
+    "Llama-3.3 70B": "llama-3.3-70b-versatile",
+    "Llama-3.1 instant 8B": "llama-3.1-8b-instant",
+    "Llama-4 guard 12B": "meta-llama/llama-guard-4-12b"
+}
+def transcribe_audio_locally(audio):
+    """Transcribe audio using local Whisper model"""
+    if audio is None:
+        return ""
+    try:
+        audio_path = audio
+        result = whisper_model.transcribe(audio_path)
+        return result["text"]
+    except Exception as e:
+        print(f"Error transcribing audio locally: {e}")
+        return f"Error transcribing audio: {str(e)}"
+def translate_text(input_text, input_lang, output_langs, model_name):
+    """Translate text using Groq's API with the selected model"""
+    if not input_text or not output_langs:
+        return ""
+    try:
+        # Get the actual model ID from our dictionary
+        model_id = AVAILABLE_MODELS.get(model_name, "qwen/qwen3-32b")
+        # Using a more direct instruction to avoid exposing the thinking process
+        system_prompt = """You are a translation assistant that provides direct, accurate translations.
+        Do NOT include any thinking, reasoning, or explanations in your response.
+        Do NOT use phrases like 'In [language]:', 'Translation:' or similar prefixes.
+        Do NOT use any special formatting like asterisks (**) or other markdown.
+        Always respond with ONLY the exact translation text itself."""
+        user_prompt = f"Translate this {input_lang} text: '{input_text}' into the following languages: {', '.join(output_langs)}. Provide each translation on a separate line with the language name as a prefix. Do not use any special formatting or markdown."
+        response = groq_client.chat.completions.create(
+            model=model_id,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
+            ]
+        )
+        translation_text = response.choices[0].message.content.strip()
+        # Remove any "thinking" patterns or COT that might have leaked through
+        translation_text = re.sub(r'<think>.*?</think>', '', translation_text, flags=re.DOTALL)
+        translation_text = translation_text.replace('**', '')
+        # Remove any line starting with common thinking patterns
+        thinking_patterns = [
+            r'^\s*Let me think.*$',
+            r'^\s*I need to.*$',
+            r'^\s*First,.*$',
+            r'^\s*Okay, so.*$',
+            r'^\s*Hmm,.*$',
+            r'^\s*Let\'s break this down.*$'
+        ]
+        for pattern in thinking_patterns:
+            translation_text = re.sub(pattern, '', translation_text, flags=re.MULTILINE)
+        return translation_text
+    except Exception as e:
+        print(f"Error translating text: {e}")
+        return f"Error: {str(e)}"
+def synthesize_speech(text, lang):
+    """Generate speech from text"""
+    if not text:
+        return None
+    try:
+        lang_code = LANGUAGE_CODES.get(lang, "en")
+        tts = gTTS(text=text, lang=lang_code)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
+            tts.save(fp.name)
+            return fp.name
+    except Exception as e:
+        print(f"Error synthesizing speech: {e}")
+        return None
+def clear_all():
+    """Clear all fields"""
+    return [""] * 4 + [None] * 3
+def process_speech_to_text(audio):
+    """Process audio and return the transcribed text"""
+    if not audio:
+        return ""
+    transcribed_text = transcribe_audio_locally(audio)
+    return transcribed_text
+def clean_translation_output(text):
+    """Clean translation output to remove any thinking or processing text"""
+    if not text:
+        return ""
+    # Remove any meta-content or thinking
+    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+    text = text.replace('**', '')
+    text = text.replace('*', '')
+    # Remove lines that appear to be thinking/reasoning
+    lines = text.split('\n')
+    cleaned_lines = []
+    for line in lines:
+        # Skip lines that look like thinking
+        if re.search(r'(^I need to|^Let me|^First|^Okay|^Hmm|^I will|^I am thinking|^I should)', line, re.IGNORECASE):
+            continue
+        # Keep translations with language names
+        if ':' in line and any(lang.lower() in line.lower() for lang in SUPPORTED_LANGUAGES):
+            cleaned_lines.append(line)
+        # Or keep direct translations without prefixes if they don't look like thinking
+        elif line.strip() and not re.search(r'(thinking|translating|understand|process)', line, re.IGNORECASE):
+            cleaned_lines.append(line)
+    return '\n'.join(cleaned_lines)
+def extract_translations(translations_text, output_langs):
+    """Extract clean translations from the model output"""
+    if not translations_text or not output_langs:
+        return [""] * 3
+    # Clean the translations text first
+    clean_text = clean_translation_output(translations_text)
+    # Try to match language patterns
+    translation_results = []
+    # First try to find language-labeled translations
+    for lang in output_langs:
+        pattern = rf'{re.escape(lang)}[\s]*:[\s]*(.*?)(?=\n\s*[A-Z]|$)'
+        match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
+        if match:
+            translation_results.append(match.group(1).strip())
+    # If we couldn't find labeled translations, just split by lines
+    if not translation_results and '\n' in clean_text:
+        lines = [line.strip() for line in clean_text.split('\n') if line.strip()]
+        for line in lines:
+            # Check if this line has a language prefix
+            if ':' in line:
+                parts = line.split(':', 1)
+                if len(parts) == 2:
+                    translation_results.append(parts[1].strip())
+            else:
+                # Just add the line as is if it seems like a translation
+                translation_results.append(line)
+    elif not translation_results:
+        # If no newlines, just use the whole text
+        translation_results.append(clean_text)
+    # Ensure we have exactly 3 results
+    while len(translation_results) < 3:
+        translation_results.append("")
+    return translation_results[:3]
+def perform_translation(audio, typed_text, input_lang, output_langs, model_name):
+    """Main function to handle translation process"""
+    # Check if we have valid inputs
+    if not output_langs:
+        return [typed_text] + [""] * 3 + [None] * 3
+    # Limit to 3 output languages
+    selected_langs = output_langs[:3]
+    # Get the input text either from typed text or by transcribing audio
+    input_text = typed_text
+    if not input_text and audio:
+        input_text = transcribe_audio_locally(audio)
+    if not input_text:
+        return [""] * 4 + [None] * 3
+    # Get translations using the selected model
+    translations_text = translate_text(input_text, input_lang, selected_langs, model_name)
+    # Extract clean translations
+    translation_results = extract_translations(translations_text, selected_langs)
+    # Generate speech for each valid translation
+    audio_paths = []
+    for i, (trans, lang) in enumerate(zip(translation_results, selected_langs)):
+        if trans and lang:
+            audio_path = synthesize_speech(trans, lang)
+            audio_paths.append(audio_path)
+        else:
+            audio_paths.append(None)
+    # Ensure we have exactly 3 audio paths
+    while len(audio_paths) < 3:
+        audio_paths.append(None)
+    # Return results in the expected format
+    return [input_text] + translation_results + audio_paths
+# Create the Gradio interface
+with gr.Blocks(title="Multilingual Translator") as demo:
+    gr.Markdown("## 🌍 Multilingual Translator with Speech Support")
+    with gr.Row():
+        with gr.Column():
+            input_lang = gr.Dropdown(
+                choices=SUPPORTED_LANGUAGES,
+                value="English",
+                label="Input Language"
+            )
+            output_langs = gr.CheckboxGroup(
+                choices=SUPPORTED_LANGUAGES,
+                label="Output Languages (select up to 3)",
+                max_choices=3
+            )
+            model_selector = gr.Dropdown(
+                choices=list(AVAILABLE_MODELS.keys()),
+                value="Qwen3 32B",
+                label="Translation Model"
+            )
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Speak Your Input (upload or record)"
+            )
+            text_input = gr.Textbox(
+                label="Or Type Text",
+                placeholder="Enter text to translate here..."
+            )
+    with gr.Row():
+        transcribed_text = gr.Textbox(
+            label="Transcribed Text (from audio)",
+            interactive=False
+        )
+    # Create translation outputs in a grid
+    with gr.Row():
+        for i in range(3):
+            with gr.Column():
+                translated_outputs = gr.Textbox(
+                    label=f"Translation {i+1}",
+                    interactive=False,
+                    visible=False
+                )
+                audio_outputs = gr.Audio(
+                    label=f"Speech Output {i+1}",
+                    visible=False
+                )
+    # Make outputs visible based on selected languages
+    def update_output_visibility(output_langs):
+        visibilities = []
+        for i in range(3):
+            if i < len(output_langs):
+                visibilities.extend([True, True])  # Text and Audio both visible
+            else:
+                visibilities.extend([False, False])  # Both hidden
+        return visibilities
+    output_langs.change(
+        update_output_visibility,
+        inputs=[output_langs],
+        outputs=translated_outputs + audio_outputs
+    )
+    with gr.Row():
+        translate_btn = gr.Button("Translate", variant="primary")
+        clear_btn = gr.Button("Clear All")
+    # Handle audio transcription
+    audio_input.change(
+        process_speech_to_text,
+        inputs=[audio_input],
+        outputs=[text_input]
+    )
+    # Handle translation
+    def on_translate(audio, text, input_lang, output_langs, model):
+        return perform_translation(audio, text, input_lang, output_langs, model)
+    translate_btn.click(
+        on_translate,
+        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
+        outputs=[transcribed_text] + translated_outputs + audio_outputs
+    )
+    # Handle Enter key in text input
+    text_input.submit(
+        on_translate,
+        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
+        outputs=[transcribed_text] + translated_outputs + audio_outputs
+    )
+    # Handle clear button
+    clear_btn.click(
+        clear_all,
+        inputs=[],
+        outputs=[transcribed_text] + translated_outputs + audio_outputs
+    )
+# Launch the application only when this file is executed as a script.
+# NOTE(review): share=True presumably asks Gradio for a public share link;
+# confirm it is wanted when the app is already hosted on Hugging Face Spaces.
+if __name__ == "__main__":
+    demo.launch(share=True)