File size: 3,120 Bytes
b84659d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
from huggingface_hub import InferenceClient
import tempfile
import uuid

# Shared Hugging Face Inference API client used by every pipeline stage.
# NOTE(review): no token is passed here — the comment below the original code
# assumed the free tier suffices for these models locally and that Spaces
# injects a token via the environment; confirm before relying on this in
# production.
client = InferenceClient()

# Model IDs for the three pipeline stages (translate -> image / audio).
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-th-en"  # Thai -> English MT
IMAGE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"  # text-to-image (SDXL)
AUDIO_MODEL = "facebook/mms-tts-eng"  # English text-to-speech

def translate_text(text):
    """Translate Thai text to English via the Helsinki-NLP opus-mt model.

    Args:
        text: Thai source text. ``None``, empty, or whitespace-only input
            is treated as "nothing to translate".

    Returns:
        The English translation, ``""`` for empty-ish input, or a fallback
        ``"Error translating: ..."`` string if the API call fails.
    """
    try:
        # Guard against None as well as empty/whitespace-only strings.
        # Previously, None reached .strip(), raised AttributeError, and was
        # swallowed by the except below, yielding "Error translating: None".
        if not text or not text.strip():
            return ""

        result = client.translation(text, model=TRANSLATION_MODEL)
        # Depending on the huggingface_hub version, the call returns either
        # an object exposing .translation_text or a list of dicts.
        if hasattr(result, 'translation_text'):
            return result.translation_text
        return result[0]['translation_text']
    except Exception as e:
        # Best-effort: surface the failure in the UI instead of crashing.
        print(f"Translation Error: {e}")
        return f"Error translating: {text}"

def generate_image(prompt, style):
    """Generate an image for *prompt*, decorated with style keywords.

    Args:
        prompt: Base English description of the desired image.
        style: One of the preset style names; unrecognized values leave
            the prompt unchanged.

    Returns:
        The generated image object, or ``None`` if generation fails.
    """
    # Each preset style contributes a fixed keyword suffix to the prompt.
    style_suffixes = {
        "Cinematic": ", cinematic lighting, highly detailed, photorealistic, 8k",
        "Anime": ", anime style, japanese animation, vibrant colors",
        "3D Model": ", 3d render, blender, unreal engine 5, isometric",
        "Oil Painting": ", oil painting, textured, artistic, van gogh style",
        "Pixel Art": ", pixel art, 16-bit, retro game style",
    }
    try:
        decorated = prompt + style_suffixes.get(style, "")
        return client.text_to_image(decorated, model=IMAGE_MODEL)
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print(f"Image Generation Error: {e}")
        return None

def generate_audio(text):
    """Synthesize English speech for *text* and return a playable file path.

    Args:
        text: English text to speak.

    Returns:
        Path to a ``.flac`` file in the system temp directory, or ``None``
        if synthesis fails.
    """
    try:
        # Request raw audio bytes from the TTS endpoint.
        speech = client.text_to_speech(text, model=AUDIO_MODEL)

        # Persist under a collision-free random name so concurrent
        # requests never overwrite each other's output.
        out_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.flac")
        with open(out_path, "wb") as out_file:
            out_file.write(speech)
        return out_path
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print(f"Audio Generation Error: {e}")
        return None

def process_pipeline(thai_text, style):
    """Run the full Thai-text -> (English, image, audio) pipeline.

    Args:
        thai_text: Thai input text from the UI.
        style: Image style preset passed through to :func:`generate_image`.

    Returns:
        A ``(english_text, image, audio_path)`` tuple; the placeholder
        ``("Please enter text.", None, None)`` when input is empty.
    """
    # Guard clause: nothing to do without input text.
    if not thai_text:
        return "Please enter text.", None, None

    print(f"Processing: {thai_text}")

    # Translate first; the English text drives both downstream outputs.
    english = translate_text(thai_text)

    # Image and audio are produced sequentially from the same translation
    # (could run in parallel, kept sequential for simplicity).
    return english, generate_image(english, style), generate_audio(english)