File size: 12,962 Bytes
f1dffb2
 
 
 
 
 
 
 
 
1f4aec3
f1dffb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed1fe42
f1dffb2
 
ed1fe42
f1dffb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed1fe42
 
 
 
 
 
 
 
 
f1dffb2
 
 
 
ed1fe42
f1dffb2
 
ed1fe42
f1dffb2
ed1fe42
 
 
f1dffb2
ed1fe42
f1dffb2
ed1fe42
f1dffb2
ed1fe42
 
 
 
 
 
 
f1dffb2
 
 
 
 
 
ed1fe42
 
 
 
 
f1dffb2
ed1fe42
f1dffb2
 
 
 
ed1fe42
 
 
 
f1dffb2
 
ed1fe42
f1dffb2
 
 
 
 
 
ed1fe42
f1dffb2
 
 
 
 
ed1fe42
 
 
f1dffb2
ed1fe42
f1dffb2
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
import gradio as gr
import os
import tempfile
import whisper
import re
from groq import Groq
from gtts import gTTS

# Load the local Whisper "small" model once at import time; the same instance
# is reused by transcribe_audio_locally for every speech-to-text request.
whisper_model = whisper.load_model("small")

# Groq client used by translate_text. The API key is read from the
# GROQ_API_KEY environment variable (os.getenv yields None when it is unset).
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Language names offered in the input dropdown and the output checkbox group.
# Malaysian Malay and Indonesian Malay are listed as two separate entries.
SUPPORTED_LANGUAGES = [
    "English", "Chinese", "Thai", 
    "Malaysian Malay", "Indonesian Malay",  # Split into two entries
    "Korean", "Japanese", "Spanish", "German", 
    "Hindi", "Urdu", "French", "Russian", 
    "Tagalog", "Arabic", "Myanmar", "Vietnamese",
    "Khmer"
]

# Maps each language name above to the language code that synthesize_speech
# passes to gTTS. Names missing from this table fall back to "en" there.
# NOTE(review): presumably every code here is accepted by the installed gTTS
# version - confirm, since an unsupported code makes speech synthesis fail.
LANGUAGE_CODES = {
    "English": "en", "Chinese": "zh", "Thai": "th", 
    "Malaysian Malay": "ms",  # Bahasa Malaysia (ms)
    "Indonesian Malay": "id",  # Bahasa Indonesia (id)
    "Korean": "ko", "Japanese": "ja", "Spanish": "es", 
    "German": "de", "Hindi": "hi", "Urdu": "ur", 
    "French": "fr", "Russian": "ru", "Tagalog": "tl", 
    "Arabic": "ar", "Myanmar": "my", "Vietnamese": "vi",
    "Khmer": "km"  # Added Khmer language code (km)
}

# Maps the display name shown in the "Translation Model" dropdown to the model
# ID sent to the Groq chat-completions call in translate_text.
# NOTE(review): "meta-llama/llama-guard-4-12b" looks like a safety-classifier
# model rather than a general chat model - confirm it is usable for translation.
AVAILABLE_MODELS = {
    "Qwen3 32B": "qwen/qwen3-32b",
    "kimi-k2": "moonshotai/kimi-k2-instruct-0905",
    "Llama-3.3 70B": "llama-3.3-70b-versatile",
    "Llama-3.1 instant 8B": "llama-3.1-8b-instant",
    "Llama-4 guard 12B": "meta-llama/llama-guard-4-12b"
}

def transcribe_audio_locally(audio):
    """Run the local Whisper model on an audio file and return its text.

    Args:
        audio: Value handed to ``whisper_model.transcribe`` (the UI supplies
            a file path), or None when there is no recording.

    Returns:
        The transcribed text, "" when ``audio`` is None, or a message
        starting with "Error transcribing audio:" when transcription fails.
    """
    if audio is None:
        return ""

    try:
        return whisper_model.transcribe(audio)["text"]
    except Exception as err:
        # Failures are reported as text instead of being raised, so the
        # caller always receives a string.
        print(f"Error transcribing audio locally: {err}")
        return f"Error transcribing audio: {str(err)}"

def translate_text(input_text, input_lang, output_langs, model_name):
    """Translate text into one or more languages with a Groq-hosted model.

    Args:
        input_text: Text to translate.
        input_lang: Name of the source language.
        output_langs: Names of the target languages.
        model_name: Key into AVAILABLE_MODELS; unknown names fall back to
            "qwen/qwen3-32b".

    Returns:
        The model reply with ``<think>`` sections, ``**`` markers and lines
        that look like leaked reasoning removed, and surrounding whitespace
        stripped. Returns "" when ``input_text`` or ``output_langs`` is
        empty or the model sends no content, and a string starting with
        "Error: " when the request fails.
    """
    if not input_text or not output_langs:
        return ""
    
    try:
        # Get the actual model ID from our dictionary
        model_id = AVAILABLE_MODELS.get(model_name, "qwen/qwen3-32b")
        
        # The system prompt must agree with the user prompt below: each
        # translation is requested with its language name as a label, which
        # is also what the downstream parser looks for.
        system_prompt = """You are a translation assistant that provides direct, accurate translations. 
        Do NOT include any thinking, reasoning, or explanations in your response.
        Do NOT use phrases like 'In [language]:' or 'Translation:'; the only prefix allowed is the requested language name.
        Do NOT use any special formatting like asterisks (**) or other markdown.
        Always respond with ONLY the language-labelled translations themselves."""
        
        user_prompt = f"Translate this {input_lang} text: '{input_text}' into the following languages: {', '.join(output_langs)}. Provide each translation on a separate line with the language name as a prefix. Do not use any special formatting or markdown."
        
        response = groq_client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        
        # The message content may be None; treat that as an empty reply
        # instead of failing on .strip() and reporting it as an error.
        translation_text = response.choices[0].message.content or ""
        
        # Remove any "thinking" patterns or COT that might have leaked through
        translation_text = re.sub(r'<think>.*?</think>', '', translation_text, flags=re.DOTALL)
        translation_text = translation_text.replace('**', '')
        
        # Remove any line starting with common thinking patterns
        thinking_patterns = [
            r'^\s*Let me think.*$',
            r'^\s*I need to.*$',
            r'^\s*First,.*$',
            r'^\s*Okay, so.*$',
            r'^\s*Hmm,.*$',
            r'^\s*Let\'s break this down.*$'
        ]
        
        for pattern in thinking_patterns:
            translation_text = re.sub(pattern, '', translation_text, flags=re.MULTILINE)
        
        # Strip last: the removals above can leave leading or trailing blank
        # lines behind.
        return translation_text.strip()
    except Exception as e:
        print(f"Error translating text: {e}")
        return f"Error: {str(e)}"

def synthesize_speech(text, lang):
    """Generate an MP3 file of ``text`` spoken in ``lang`` using gTTS.

    Args:
        text: Text to speak.
        lang: Language name; looked up in LANGUAGE_CODES, falling back to
            "en" for names that are not in the table.

    Returns:
        Path of the generated ``.mp3`` temporary file, or None when ``text``
        is empty or synthesis fails. The file is not deleted here; the
        caller owns it.
    """
    if not text:
        return None

    mp3_path = None
    try:
        lang_code = LANGUAGE_CODES.get(lang, "en")
        tts = gTTS(text=text, lang=lang_code)

        # delete=False keeps the file on disk after the handle is closed so
        # its path can be returned. Only the name is needed here; gTTS opens
        # the path itself when saving.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            mp3_path = fp.name
        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"Error synthesizing speech: {e}")
        # Do not leave an empty or partial temp file behind when saving fails.
        if mp3_path is not None:
            try:
                os.remove(mp3_path)
            except OSError:
                # Best-effort cleanup: the file may already be gone.
                pass
        return None

def clear_all():
    """Return reset values: four empty strings followed by three None entries."""
    blank_texts = ["" for _ in range(4)]
    blank_audio = [None for _ in range(3)]
    return blank_texts + blank_audio

def process_speech_to_text(audio):
    """Transcribe ``audio`` and return the text, or "" when no audio is given."""
    return transcribe_audio_locally(audio) if audio else ""

def clean_translation_output(text):
    """Strip reasoning chatter and asterisk markup from model output.

    ``<think>...</think>`` sections and all asterisks are removed first.
    The text is then filtered line by line:

    * lines starting with a reasoning opener ("I need to", "Let me", ...)
      are dropped;
    * lines containing a colon and the name of a supported language are kept;
    * any other non-blank line is kept unless it mentions thinking,
      translating, understand or process.

    Args:
        text: Raw model output, possibly empty or None.

    Returns:
        The surviving lines joined with newlines, or "" for empty input.
    """
    if not text:
        return ""

    without_think = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    plain = without_think.replace('**', '').replace('*', '')

    def _keep(line):
        """Decide whether a single output line survives the filter."""
        if re.search(r'(^I need to|^Let me|^First|^Okay|^Hmm|^I will|^I am thinking|^I should)', line, re.IGNORECASE):
            return False
        lowered = line.lower()
        if ':' in line and any(lang.lower() in lowered for lang in SUPPORTED_LANGUAGES):
            return True
        if not line.strip():
            return False
        return re.search(r'(thinking|translating|understand|process)', line, re.IGNORECASE) is None

    return '\n'.join(line for line in plain.split('\n') if _keep(line))

def extract_translations(translations_text, output_langs):
    """Split model output into exactly three per-language translation strings.

    The text is first passed through clean_translation_output. Each requested
    language is then searched for as a ``<language>:`` label (case-insensitive);
    the captured text runs up to the next line that starts with a letter, or
    to the end of the text.

    When at least one label is found, the result is positionally aligned with
    ``output_langs``: a language whose label is missing gets "" in its own
    slot, so later translations are never shifted onto the wrong language.

    When no label is found at all, the non-blank lines are used in order
    (dropping anything before the first colon on a line), or the whole text
    if it is a single line.

    Args:
        translations_text: Raw model output.
        output_langs: Requested target language names.

    Returns:
        A list of exactly three strings, padded with "" or truncated.
    """
    if not translations_text or not output_langs:
        return [""] * 3
    
    # Clean the translations text first
    clean_text = clean_translation_output(translations_text)
    
    # Look for a "<language>: <text>" entry for every requested language
    matches = [
        re.search(
            rf'{re.escape(lang)}[\s]*:[\s]*(.*?)(?=\n\s*[A-Z]|$)',
            clean_text,
            re.IGNORECASE | re.DOTALL,
        )
        for lang in output_langs
    ]
    
    if any(match is not None for match in matches):
        # Keep one slot per requested language so index i always belongs to
        # output_langs[i], even when some labels are missing.
        translation_results = [
            match.group(1).strip() if match else "" for match in matches
        ]
    elif '\n' in clean_text:
        # No labels at all: fall back to one translation per non-blank line
        translation_results = []
        for line in (raw.strip() for raw in clean_text.split('\n')):
            if not line:
                continue
            if ':' in line:
                # Drop whatever prefix precedes the first colon
                translation_results.append(line.split(':', 1)[1].strip())
            else:
                translation_results.append(line)
    else:
        # If no newlines, just use the whole text
        translation_results = [clean_text]
    
    # Ensure we have exactly 3 results
    translation_results += [""] * (3 - len(translation_results))
    return translation_results[:3]

def perform_translation(audio, typed_text, input_lang, output_langs, model_name):
    """Run the transcribe -> translate -> speak pipeline for the UI.

    Args:
        audio: Audio input, transcribed only when no text was typed.
        typed_text: Text entered by the user; takes priority over audio.
        input_lang: Name of the source language.
        output_langs: Target language names; only the first three are used.
        model_name: Display name of the translation model.

    Returns:
        A seven-item list: the input text, three translation strings, and
        three speech file paths (None where no speech was generated).
    """
    # Nothing to translate into: echo the typed text with blank outputs
    if not output_langs:
        return [typed_text, "", "", "", None, None, None]

    targets = output_langs[:3]

    # Typed text wins; audio is the fallback source
    source_text = typed_text
    if audio and not source_text:
        source_text = transcribe_audio_locally(audio)

    if not source_text:
        return ["", "", "", "", None, None, None]

    raw_reply = translate_text(source_text, input_lang, targets, model_name)
    translations = extract_translations(raw_reply, targets)

    # One speech file per translation that has both text and a language
    speech_files = [
        synthesize_speech(translated, language) if translated and language else None
        for translated, language in zip(translations, targets)
    ]
    # Pad to the three audio slots
    speech_files.extend([None] * (3 - len(speech_files)))

    return [source_text] + translations + speech_files

# Create the Gradio interface: selection controls, audio/text inputs, three
# translation/speech output slots, and the event wiring between them.
with gr.Blocks(title="Multilingual Translator") as demo:
    gr.Markdown("## 🌍 Multilingual Translator with Speech Support")
    
    # Language and model selection
    with gr.Row():
        with gr.Column():
            input_lang = gr.Dropdown(
                choices=SUPPORTED_LANGUAGES, 
                value="English", 
                label="Input Language"
            )
            # max_choices is not supported here, so the 3-language limit is
            # enforced in validate_output_langs instead.
            output_langs = gr.CheckboxGroup(
                choices=SUPPORTED_LANGUAGES, 
                label="Output Languages (select up to 3)"
            )
            model_selector = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()), 
                value="Qwen3 32B", 
                label="Translation Model"
            )
    
    # Input: recorded/uploaded audio (delivered as a file path) or typed text
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath", 
                label="Speak Your Input (upload or record)"
            )
            text_input = gr.Textbox(
                label="Or Type Text", 
                placeholder="Enter text to translate here..."
            )
    
    # Read-only echo of the text that was actually translated
    transcribed_text = gr.Textbox(
        label="Transcribed Text (from audio)", 
        interactive=False
    )
    
    # Three translation slots, each with a text box and an audio player
    translated_outputs = []
    audio_outputs = []
    
    with gr.Row():
        for i in range(3):
            with gr.Column():
                translated_output = gr.Textbox(
                    label=f"Translation {i+1}", 
                    interactive=False,
                    visible=True  # Always visible for simplicity
                )
                translated_outputs.append(translated_output)
                
                audio_output = gr.Audio(
                    label=f"Speech Output {i+1}",
                    visible=True  # Always visible for simplicity
                )
                audio_outputs.append(audio_output)
    
    def validate_output_langs(output_langs):
        """Limit the selection to three languages, warning when it is cut."""
        if len(output_langs) > 3:
            # Show warning but still allow processing with first 3
            gr.Warning("Please select only up to 3 languages. Using first 3 selected.")
            return output_langs[:3]
        return output_langs
    
    with gr.Row():
        translate_btn = gr.Button("Translate", variant="primary")
        clear_btn = gr.Button("Clear All")
    
    def handle_audio_transcription(audio):
        """Transcribe new audio; return "" when the audio was removed."""
        if audio:
            return process_speech_to_text(audio)
        return ""
    
    # The transcription is written into the typed-text box, which is the
    # field perform_translation reads first.
    audio_input.change(
        handle_audio_transcription,
        inputs=[audio_input],
        outputs=[text_input]
    )
    
    def handle_translation(audio, text, input_lang, output_langs, model):
        """Validate the language selection, then run the translation."""
        validated_langs = validate_output_langs(output_langs)
        return perform_translation(audio, text, input_lang, validated_langs, model)
    
    translate_btn.click(
        handle_translation,
        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )
    
    # Handle Enter key in text input
    text_input.submit(
        handle_translation,
        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )
    
    def handle_clear():
        """Return reset values for both input fields and all seven outputs."""
        # Order matches the outputs list below: audio input, typed text,
        # then transcribed text + 3 translations, then 3 speech players.
        return [None, ""] + [""] * 4 + [None] * 3
    
    # "Clear All" resets the audio and typed-text inputs as well as the
    # outputs, so no stale input is left behind for the next translation.
    clear_btn.click(
        handle_clear,
        inputs=[],
        outputs=[audio_input, text_input, transcribed_text] + translated_outputs + audio_outputs
    )

# Launch the application only when this file is run as a script.
# NOTE(review): share=True presumably asks Gradio to publish a public share
# link in addition to the local server - confirm that exposing the app (and
# its use of the Groq API key) publicly is intended.
if __name__ == "__main__":
    demo.launch(share=True)