# NOTE(review): removed scraped Hugging Face Spaces page chrome (status lines,
# file-size line, commit hash, and line-number gutter) — not part of the source.
import os
from huggingface_hub import InferenceClient
import tempfile
import uuid
# Initialize the client
# We rely on the free tier which works for these specific models without a token locally,
# but in production/Spaces, it uses the environment's token automatically.
client = InferenceClient()
# Define Models
# Thai -> English machine-translation model used by translate_text().
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-th-en"
# Text-to-image model used by generate_image().
IMAGE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
# English text-to-speech model used by generate_audio().
AUDIO_MODEL = "facebook/mms-tts-eng"
def translate_text(text):
    """Translate Thai text to English via the shared InferenceClient.

    Blank/whitespace-only input yields "".  Any failure (network, API,
    bad input) is logged and reported as a best-effort
    "Error translating: ..." string instead of raising.
    """
    try:
        if not text.strip():
            return ""
        response = client.translation(text, model=TRANSLATION_MODEL)
        # Newer huggingface_hub versions return an object exposing
        # .translation_text; older ones return [{'translation_text': ...}].
        return (
            response.translation_text
            if hasattr(response, "translation_text")
            else response[0]["translation_text"]
        )
    except Exception as err:
        print(f"Translation Error: {err}")
        return f"Error translating: {text}"
def generate_image(prompt, style):
    """Generate an image for *prompt*, decorated with style keywords.

    Returns the image produced by the Inference API on success, or
    None if generation fails (the error is printed, not raised).
    """
    # Keyword suffixes appended to the prompt per style choice; an
    # unrecognized style leaves the prompt untouched.
    suffixes = {
        "Cinematic": ", cinematic lighting, highly detailed, photorealistic, 8k",
        "Anime": ", anime style, japanese animation, vibrant colors",
        "3D Model": ", 3d render, blender, unreal engine 5, isometric",
        "Oil Painting": ", oil painting, textured, artistic, van gogh style",
        "Pixel Art": ", pixel art, 16-bit, retro game style",
    }
    try:
        full_prompt = prompt + suffixes.get(style, "")
        return client.text_to_image(full_prompt, model=IMAGE_MODEL)
    except Exception as err:
        print(f"Image Generation Error: {err}")
        return None
def generate_audio(text):
    """Synthesize English speech for *text* and save it as a FLAC file.

    Returns the path of the written temp file on success, or None if
    synthesis fails (the error is printed, not raised).
    """
    try:
        speech = client.text_to_speech(text, model=AUDIO_MODEL)
        # Unique uuid-based name in the system temp directory so
        # concurrent requests never collide on a filename.
        out_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.flac")
        with open(out_path, "wb") as audio_file:
            audio_file.write(speech)
        return out_path
    except Exception as err:
        print(f"Audio Generation Error: {err}")
        return None
def process_pipeline(thai_text, style):
    """Orchestrate the full flow: translate Thai text, then render media.

    Parameters
    ----------
    thai_text : str or None
        Thai input text; empty/whitespace-only input short-circuits.
    style : str
        Image style label forwarded to generate_image().

    Returns
    -------
    tuple
        (english_text, image_or_None, audio_path_or_None).
    """
    # Guard against None, empty AND whitespace-only input — the original
    # only rejected falsy values, letting "   " through to the APIs
    # (inconsistent with translate_text's own strip check).
    if not thai_text or not thai_text.strip():
        return "Please enter text.", None, None
    print(f"Processing: {thai_text}")
    # Step 1: Translate
    eng_text = translate_text(thai_text)
    # Step 2 & 3: generate image and audio sequentially (could be done
    # in parallel ideally, kept sequential here for simplicity).
    image = generate_image(eng_text, style)
    audio_path = generate_audio(eng_text)
    return eng_text, image, audio_path