import os
import tempfile
import uuid

from huggingface_hub import InferenceClient

# Initialize the client.
# We rely on the free tier which works for these specific models without a
# token locally, but in production/Spaces, it uses the environment's token
# automatically.
client = InferenceClient()

# Model identifiers passed to the inference client.
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-th-en"
IMAGE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
AUDIO_MODEL = "facebook/mms-tts-eng"

# Prompt suffix appended for each supported style. A style that is not listed
# here leaves the prompt unchanged.
_STYLE_SUFFIXES = {
    "Cinematic": ", cinematic lighting, highly detailed, photorealistic, 8k",
    "Anime": ", anime style, japanese animation, vibrant colors",
    "3D Model": ", 3d render, blender, unreal engine 5, isometric",
    "Oil Painting": ", oil painting, textured, artistic, van gogh style",
    "Pixel Art": ", pixel art, 16-bit, retro game style",
}


def _translate(text):
    """Call the translation endpoint and return the translated text.

    Unlike ``translate_text`` this helper does not catch anything, so callers
    can tell a failed translation apart from a successful one.
    """
    result = client.translation(text, model=TRANSLATION_MODEL)
    # The API usually returns [{'translation_text': '...'}] or a similar
    # object exposing a ``translation_text`` attribute; accept both shapes.
    if hasattr(result, "translation_text"):
        return result.translation_text
    return result[0]["translation_text"]


def translate_text(text):
    """Translate Thai text to English.

    Args:
        text: The text to translate.

    Returns:
        The translated text, ``""`` when ``text`` is ``None``, empty or only
        whitespace, or the string ``"Error translating: <text>"`` when the
        translation fails for any reason (the error is printed).
    """
    try:
        # ``not text`` also covers None, which previously fell through to
        # ``None.strip()`` and was reported as a translation error.
        if not text or not text.strip():
            return ""
        return _translate(text)
    except Exception as e:
        print(f"Translation Error: {e}")
        return f"Error translating: {text}"


def generate_image(prompt, style):
    """Generate an image from a text prompt.

    Args:
        prompt: The base prompt.
        style: Name of a style preset; a known preset appends its keywords to
            the prompt, any other value leaves the prompt as is.

    Returns:
        Whatever ``client.text_to_image`` returns, or ``None`` if generation
        fails (the error is printed).
    """
    try:
        # Enhance the prompt based on the selected style.
        enhanced_prompt = prompt + _STYLE_SUFFIXES.get(style, "")
        return client.text_to_image(enhanced_prompt, model=IMAGE_MODEL)
    except Exception as e:
        print(f"Image Generation Error: {e}")
        return None


def generate_audio(text):
    """Generate speech audio from English text and save it to a temp file.

    Args:
        text: The text to synthesize.

    Returns:
        Path of the written audio file inside the system temp directory, or
        ``None`` when ``text`` is empty or when synthesis/writing fails (the
        error is printed).
    """
    if not text:
        # Nothing to synthesize; skip the remote call entirely.
        return None
    try:
        audio_bytes = client.text_to_speech(text, model=AUDIO_MODEL)

        # A random UUID keeps concurrent requests from overwriting each other.
        # NOTE(review): the ".flac" extension assumes the endpoint returns
        # FLAC-encoded bytes -- confirm against the model's output format.
        filepath = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.flac")
        with open(filepath, "wb") as f:
            f.write(audio_bytes)
        return filepath
    except Exception as e:
        print(f"Audio Generation Error: {e}")
        return None


def process_pipeline(thai_text, style):
    """Run the full flow: translate, then generate an image and audio.

    Args:
        thai_text: The Thai input text.
        style: Image style preset forwarded to ``generate_image``.

    Returns:
        A ``(text, image, audio_path)`` tuple. ``text`` is the English
        translation on success. When the input is empty/blank it is
        ``"Please enter text."`` and when translation fails it is
        ``"Error translating: <thai_text>"``; in both of those cases
        ``image`` and ``audio_path`` are ``None``.
    """
    # Blank input (including whitespace only) has nothing to translate, so do
    # not spend image/audio requests on an empty prompt.
    if not thai_text or not str(thai_text).strip():
        return "Please enter text.", None, None

    print(f"Processing: {thai_text}")

    # Step 1: Translate. On failure, stop here instead of feeding the error
    # message into the image and speech models as if it were the translation.
    try:
        eng_text = _translate(thai_text)
    except Exception as e:
        print(f"Translation Error: {e}")
        return f"Error translating: {thai_text}", None, None

    # Step 2 & 3: Generate image and audio (could run in parallel, but kept
    # sequential here for simplicity).
    image = generate_image(eng_text, style)
    audio_path = generate_audio(eng_text)

    return eng_text, image, audio_path