# Spaces: channelcorp / Ko-TTS-Arena (Sleeping)
# File size: 16,499 Bytes

# Korean TTS Arena - TTS Router
import os
import json
import base64
import tempfile
import requests
import urllib.request
import urllib.parse
import wave
import struct
from dotenv import load_dotenv

# Optional: scipy for high-quality resampling
try:
    from scipy import signal
    from scipy.io import wavfile
    import numpy as np
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False
    print("Warning: scipy not installed. Using basic resampling.")

load_dotenv()

# Target sample rate for all TTS outputs (for fair comparison)
TARGET_SAMPLE_RATE = 16000

# Korean-capable TTS providers wired up in this module
# - Channel Talk: in-house API
# - ElevenLabs: direct API
# - OpenAI: API (gpt-4o-mini-tts)
# - Google: API
# - CLOVA: Naver Cloud API
# - Supertone: API
# - Humelo: DIVE API

# Base endpoint for Channel Talk TTS; predict_channel_tts appends "/<voice>".
# Overridable through the CHANNEL_TTS_URL environment variable.
CHANNEL_TTS_URL = os.getenv(
    "CHANNEL_TTS_URL",
    "https://ch-tts-streaming-demo.channel.io/v1/text-to-speech"
)

# ElevenLabs credentials; the key has no default, so it is None when unset.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # Rachel (default)

# Supertone credentials; the key has no default, so it is None when unset.
SUPERTONE_API_KEY = os.getenv("SUPERTONE_API_KEY")
SUPERTONE_VOICE_ID = os.getenv("SUPERTONE_VOICE_ID", "91992bbd4758bdcf9c9b01")  # default voice

# CLOVA TTS (Naver Cloud); sent as the X-NCP-APIGW-API-KEY-ID / X-NCP-APIGW-API-KEY headers
CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")

# Humelo DIVE TTS; unlike CHANNEL_TTS_URL, this endpoint is hard-coded and not overridable
HUMELO_API_KEY = os.getenv("HUMELO_API_KEY")
HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-synthesize-v1"

def resample_wav_to_16khz(input_path: str) -> str:
    """
    Resample a WAV file to mono 16-bit PCM at TARGET_SAMPLE_RATE.

    The conversion is best-effort: when scipy is unavailable, the file is
    already at the target rate, or anything goes wrong, the original path is
    returned untouched so the caller still has playable audio.

    The signal level is preserved. Samples are mapped onto the int16 scale
    according to the source sample format and clipped, rather than being
    peak-normalised, so that clips which need resampling are not made louder
    than clips which do not.

    Args:
        input_path: Path to a WAV file readable by ``scipy.io.wavfile``.

    Returns:
        Path to a new temporary WAV file when resampling succeeded (the
        original file is then deleted), otherwise ``input_path``.
    """
    if not HAS_SCIPY:
        # If scipy is not available, return original file
        print(f"[Resample] scipy not available, skipping resample for {input_path}")
        return input_path

    output_path = None
    try:
        original_rate, data = wavfile.read(input_path)

        # If already at the target rate, return as-is
        if original_rate == TARGET_SAMPLE_RATE:
            print(f"[Resample] Already {TARGET_SAMPLE_RATE}Hz, no resample needed")
            return input_path

        print(f"[Resample] Resampling from {original_rate}Hz to {TARGET_SAMPLE_RATE}Hz")

        source_dtype = data.dtype
        samples = data.astype(np.float64)

        # Downmix multi-channel audio by averaging the channels
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Map each sample format scipy can return onto the int16 scale.
        if source_dtype.kind == "f":
            # Float WAVs are nominally in [-1.0, 1.0]
            samples = samples * 32767.0
        elif source_dtype == np.uint8:
            # 8-bit PCM is unsigned with its midpoint at 128
            samples = (samples - 128.0) * 256.0
        elif source_dtype == np.int32:
            # 24/32-bit PCM is delivered left-justified in int32
            samples = samples / 65536.0

        # Calculate the number of samples in the output
        num_samples = int(len(samples) * TARGET_SAMPLE_RATE / original_rate)

        resampled = signal.resample(samples, num_samples)

        # FFT resampling can overshoot slightly; clip instead of wrapping around
        pcm = np.clip(np.rint(resampled), -32768, 32767).astype(np.int16)

        # Reserve a temporary file name, then let scipy write to it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            output_path = f.name

        wavfile.write(output_path, TARGET_SAMPLE_RATE, pcm)

    except Exception as e:
        print(f"[Resample] Error resampling: {e}, returning original")
        # Do not leave a half-written temporary file behind
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return input_path

    # The resampled file is valid at this point; failing to delete the
    # original must not throw the result away.
    try:
        os.remove(input_path)
    except OSError as e:
        print(f"[Resample] Could not remove original {input_path}: {e}")

    print(f"[Resample] Successfully resampled to {output_path}")
    return output_path


def convert_mp3_to_wav_16khz(input_path: str) -> str:
    """
    Convert an MP3 file to a mono WAV at TARGET_SAMPLE_RATE.

    pydub is tried first. If importing it raises ImportError, the ``ffmpeg``
    executable is invoked directly instead. The conversion is best-effort: on
    any failure the original path is returned and the temporary WAV created
    for the attempt is removed.

    Args:
        input_path: Path to the MP3 file to convert.

    Returns:
        Path to a new temporary WAV file when conversion succeeded (the
        original MP3 is then deleted), otherwise ``input_path``.
    """

    def _remove_quietly(path):
        """Delete ``path`` if it is set, ignoring filesystem errors."""
        if path is None:
            return
        try:
            os.remove(path)
        except OSError:
            pass

    output_path = None
    try:
        from pydub import AudioSegment

        print(f"[Convert] Converting MP3 to WAV 16kHz: {input_path}")

        # Load MP3
        audio = AudioSegment.from_mp3(input_path)

        # Convert to mono and set sample rate
        audio = audio.set_channels(1).set_frame_rate(TARGET_SAMPLE_RATE)

        # Reserve a temporary file name, then export into it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            output_path = f.name

        audio.export(output_path, format="wav")

    except ImportError:
        print("[Convert] pydub not available, trying ffmpeg directly")
        # Drop anything the pydub attempt may have created before retrying
        _remove_quietly(output_path)
        output_path = None
        try:
            import subprocess

            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
                output_path = f.name

            # "-y" is required because the temporary output file already exists
            subprocess.run([
                "ffmpeg", "-y", "-i", input_path,
                "-ar", str(TARGET_SAMPLE_RATE),
                "-ac", "1",
                output_path
            ], check=True, capture_output=True)

        except Exception as e:
            print(f"[Convert] ffmpeg conversion failed: {e}, returning original")
            _remove_quietly(output_path)
            return input_path

        # Conversion succeeded; a failed delete must not discard the result
        _remove_quietly(input_path)
        return output_path

    except Exception as e:
        print(f"[Convert] Error converting: {e}, returning original")
        _remove_quietly(output_path)
        return input_path

    # Conversion succeeded; a failed delete must not discard the result
    _remove_quietly(input_path)

    print(f"[Convert] Successfully converted to {output_path}")
    return output_path


# Arena model ID -> provider configuration.
# "provider" selects the predict_* function in predict_tts; the remaining keys
# are passed to that function as its voice / model / speaker / emotion arguments.
model_mapping = {
    # Channel Talk TTS (Korean-focused)
    "channel-hana": {
        "provider": "channel",
        "voice": "hana",
    },
    # ElevenLabs (multilingual) - direct API call
    "eleven-multilingual-v2": {
        "provider": "elevenlabs",
        "model": "eleven_multilingual_v2",
    },
    # OpenAI TTS (gpt-4o-mini-tts)
    "openai-gpt-4o-mini-tts": {
        "provider": "openai",
        "model": "gpt-4o-mini-tts",
        "voice": "coral",
    },
    # Google Cloud TTS
    "google-wavenet": {
        "provider": "google",
        "voice": "ko-KR-Wavenet-A",
    },
    "google-neural2": {
        "provider": "google",
        "voice": "ko-KR-Neural2-A",
    },
    # CLOVA TTS (Naver Cloud - Korean-focused)
    "clova-nara": {
        "provider": "clova",
        "speaker": "nara",
    },
    # Supertone TTS (Korean-focused)
    "supertone-sona": {
        "provider": "supertone",
        "model": "sona_speech_1",
    },
    # Humelo DIVE TTS (Korean-focused)
    # NOTE(review): the model ID says "sia" but the voice name reads "Ria" -
    # confirm which one is intended.
    "humelo-sia": {
        "provider": "humelo",
        "voice": "리아",
        "emotion": "neutral",
    },
}


def predict_channel_tts(text: str, voice: str = "hana") -> str:
    """
    Synthesize speech with the Channel Talk TTS API.

    Args:
        text: Text to synthesize.
        voice: Voice name, appended to CHANNEL_TTS_URL as a path segment.

    Returns:
        Path to a temporary ".wav" file holding the response body
        (the request asks for the "wav_24000" output format).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    endpoint = f"{CHANNEL_TTS_URL}/{voice}"
    request_body = {"text": text, "output_format": "wav_24000"}

    reply = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=request_body,
        timeout=30,
    )
    reply.raise_for_status()

    # Persist the audio; the caller owns (and must eventually delete) the file
    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        wav_file.write(reply.content)
    return wav_file.name


def predict_elevenlabs_tts(text: str, model: str = "eleven_multilingual_v2") -> str:
    """
    Synthesize speech by calling the ElevenLabs text-to-speech API directly.

    The voice is taken from ELEVENLABS_VOICE_ID.

    Args:
        text: Text to synthesize.
        model: Value sent as "model_id".

    Returns:
        Path to a temporary ".mp3" file holding the response body
        (the request sets "Accept: audio/mpeg").

    Raises:
        ValueError: If ELEVENLABS_API_KEY is not configured.
        requests.HTTPError: If the API answers with an error status.
    """
    if not ELEVENLABS_API_KEY:
        raise ValueError("ELEVENLABS_API_KEY 환경 변수가 설정되지 않았습니다.")

    endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}"
    request_headers = {
        "xi-api-key": ELEVENLABS_API_KEY,
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }
    request_body = {
        "text": text,
        "model_id": model,
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75,
        },
    }

    reply = requests.post(endpoint, headers=request_headers, json=request_body, timeout=60)
    reply.raise_for_status()

    mp3_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    with mp3_file:
        mp3_file.write(reply.content)
    return mp3_file.name


def predict_openai_tts(text: str, model: str = "gpt-4o-mini-tts", voice: str = "coral") -> str:
    """
    Synthesize speech with the OpenAI audio/speech API.

    Args:
        text: Text to synthesize.
        model: Model name; "gpt-4o-mini-tts" additionally receives style
            instructions tuned for Korean speech.
        voice: Voice name.

    Returns:
        Path to a temporary ".wav" file holding the response body
        (the request sets "response_format" to "wav").

    Raises:
        ValueError: If OPENAI_API_KEY is not set in the environment.
        requests.HTTPError: If the API answers with an error status.
    """
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        raise ValueError("OPENAI_API_KEY 환경 변수가 설정되지 않았습니다.")

    request_body = {
        "model": model,
        "input": text,
        "voice": voice,
        "response_format": "wav",
    }

    # Only gpt-4o-mini-tts gets the "instructions" field
    if model == "gpt-4o-mini-tts":
        request_body["instructions"] = """Voice: Natural and clear Korean voice, with appropriate intonation and rhythm.
Punctuation: Well-structured with natural pauses for clarity.
Delivery: Calm, professional, and easy to understand.
Phrasing: Clear pronunciation with proper Korean phonetics.
Tone: Friendly yet professional, suitable for various contexts."""

    request_headers = {
        "Authorization": f"Bearer {openai_key}",
        "Content-Type": "application/json",
    }
    reply = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers=request_headers,
        json=request_body,
        timeout=60,
    )
    reply.raise_for_status()

    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        wav_file.write(reply.content)
    return wav_file.name


def predict_clova_tts(text: str, speaker: str = "nara") -> str:
    """
    Synthesize speech with the Naver Cloud CLOVA TTS API.

    Args:
        text: Text to synthesize.
        speaker: CLOVA speaker name.

    Returns:
        Path to a temporary ".mp3" file holding the response body
        (the request sets "format=mp3").

    Raises:
        ValueError: If the CLOVA credentials are not configured, or the API
            answers with a status other than 200.
        urllib.error.URLError: If the request fails; ``urlopen`` raises its
            HTTPError subclass for HTTP error statuses.
    """
    client_id = CLOVA_CLIENT_ID
    client_secret = CLOVA_API_KEY

    if not client_id or not client_secret:
        raise ValueError("CLOVA_CLIENT_ID 또는 CLOVA_API_KEY 환경 변수가 설정되지 않았습니다.")

    # Encode every field, not just the text, so a speaker name containing
    # "&" or "=" cannot corrupt the form body. quote/safe="/" keeps the text
    # encoding the same as urllib.parse.quote(text).
    form_body = urllib.parse.urlencode(
        {
            "speaker": speaker,
            "volume": 0,
            "speed": 0,
            "pitch": 0,
            "format": "mp3",
            "text": text,
        },
        safe="/",
        quote_via=urllib.parse.quote,
    )
    url = "https://naveropenapi.apigw.ntruss.com/tts-premium/v1/tts"

    request = urllib.request.Request(url)
    request.add_header("X-NCP-APIGW-API-KEY-ID", client_id)
    request.add_header("X-NCP-APIGW-API-KEY", client_secret)

    # The context manager closes the HTTP connection even if reading fails
    with urllib.request.urlopen(request, data=form_body.encode("utf-8"), timeout=60) as response:
        if response.getcode() != 200:
            raise ValueError(f"CLOVA TTS API 오류: {response.getcode()}")
        audio_bytes = response.read()

    # Create the temporary file only once the audio is fully downloaded
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        f.write(audio_bytes)
        return f.name

def predict_supertone_tts(text: str, model: str = "sona_speech_1") -> str:
    """
    Synthesize Korean speech with the Supertone text-to-speech API.

    The voice is taken from SUPERTONE_VOICE_ID.

    Args:
        text: Text to synthesize.
        model: Value sent as "model".

    Returns:
        Path to a temporary ".wav" file holding the response body
        (the request sets "output_format" to "wav").

    Raises:
        ValueError: If SUPERTONE_API_KEY is not configured.
        requests.HTTPError: If the API answers with an error status.
    """
    if not SUPERTONE_API_KEY:
        raise ValueError("SUPERTONE_API_KEY 환경 변수가 설정되지 않았습니다.")

    endpoint = f"https://supertoneapi.com/v1/text-to-speech/{SUPERTONE_VOICE_ID}"
    request_headers = {
        "x-sup-api-key": SUPERTONE_API_KEY,
        "Content-Type": "application/json",
    }
    request_body = {
        "text": text,
        "language": "ko",
        "model": model,
        "output_format": "wav",
        "voice_settings": {
            "pitch_shift": 0,
            "pitch_variance": 1,
            "speed": 1,
        },
    }

    reply = requests.post(endpoint, headers=request_headers, json=request_body, timeout=60)
    reply.raise_for_status()

    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        wav_file.write(reply.content)
    return wav_file.name


def predict_humelo_tts(text: str, voice: str = "리아", emotion: str = "neutral") -> str:
    """
    Synthesize Korean speech with the Humelo DIVE TTS API.

    The API answers with JSON containing an "audio_url"; the audio is then
    downloaded from that URL.

    Args:
        text: Text to synthesize.
        voice: Value sent as "voiceName".
        emotion: Value sent as "emotion".

    Returns:
        Path to a temporary file holding the downloaded audio. The suffix is
        ".mp3" or ".wav" depending on the detected format, and callers use it
        to choose the conversion path.

    Raises:
        ValueError: If HUMELO_API_KEY is not configured or the response has
            no "audio_url".
        requests.HTTPError: If either request answers with an error status.
    """
    api_key = HUMELO_API_KEY
    if not api_key:
        raise ValueError("HUMELO_API_KEY 환경 변수가 설정되지 않았습니다.")

    response = requests.post(
        HUMELO_API_URL,
        headers={
            "Content-Type": "application/json",
            "X-API-Key": api_key,
        },
        json={
            "text": text,
            "mode": "preset",
            "voiceName": voice,
            "emotion": emotion,
            "lang": "ko",
        },
        timeout=60,
    )
    response.raise_for_status()

    data = response.json()
    audio_url = data.get("audio_url")

    if not audio_url:
        raise ValueError("Humelo API가 오디오 URL을 반환하지 않았습니다.")

    # Download audio from URL
    audio_response = requests.get(audio_url, timeout=60)
    audio_response.raise_for_status()
    audio_bytes = audio_response.content

    # Detect the container. The bytes themselves are the most reliable signal;
    # headers and the URL are fallbacks.
    content_type = audio_response.headers.get("Content-Type", "").lower()
    # Look at the path only, so "file.mp3?token=..." is still recognised
    url_path = urllib.parse.urlparse(audio_url).path.lower()

    is_riff_wave = audio_bytes[:4] == b"RIFF" and audio_bytes[8:12] == b"WAVE"
    has_mp3_magic = audio_bytes[:3] == b"ID3" or (
        len(audio_bytes) >= 2
        and audio_bytes[0] == 0xFF
        and (audio_bytes[1] & 0xE0) == 0xE0  # MPEG audio frame sync bits
    )

    if is_riff_wave:
        suffix = ".wav"
    elif (
        has_mp3_magic
        or "mp3" in content_type
        or "mpeg" in content_type  # the registered MP3 MIME type is audio/mpeg
        or url_path.endswith(".mp3")
    ):
        suffix = ".mp3"
    else:
        suffix = ".wav"

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
        f.write(audio_bytes)
        return f.name


def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
    """
    Synthesize Korean speech with the Google Cloud Text-to-Speech REST API.

    Args:
        text: Text to synthesize.
        voice: Google voice name, used with language code "ko-KR".

    Returns:
        Path to a temporary ".wav" file holding the decoded "audioContent"
        (LINEAR16 requested at 24000 Hz).

    Raises:
        ValueError: If GOOGLE_API_KEY is not set in the environment or the
            response carries no "audioContent".
        requests.HTTPError: If the API answers with an error status.
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY 환경 변수가 설정되지 않았습니다.")

    response = requests.post(
        "https://texttospeech.googleapis.com/v1/text:synthesize",
        headers={
            "Content-Type": "application/json",
            # Send the key as a header rather than "?key=...": requests puts
            # the full URL into HTTPError messages, which callers print, so a
            # key in the query string would leak into logs.
            "X-Goog-Api-Key": api_key,
        },
        json={
            "input": {"text": text},
            "voice": {
                "languageCode": "ko-KR",
                "name": voice,
            },
            "audioConfig": {
                "audioEncoding": "LINEAR16",
                "sampleRateHertz": 24000,
            },
        },
        timeout=30,
    )
    response.raise_for_status()

    audio_content = response.json().get("audioContent")
    if not audio_content:
        raise ValueError("Google TTS API가 오디오를 반환하지 않았습니다.")

    # The audio arrives base64-encoded inside the JSON body
    audio_bytes = base64.b64decode(audio_content)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        f.write(audio_bytes)
        return f.name


def predict_tts(text: str, model: str) -> str:
    """
    Main entry point: synthesize ``text`` with the given arena model.

    Looks the model up in ``model_mapping``, calls the matching provider
    function, then standardizes the result to 16kHz WAV for fair comparison.

    Args:
        text: Text to synthesize.
        model: Model ID (a key of ``model_mapping``).

    Returns:
        Path to the generated audio file (unified to 16kHz WAV).

    Raises:
        ValueError: If the model ID or its provider is not recognised.
    """
    print(f"[TTS] Predicting for model: {model}")

    if model not in model_mapping:
        raise ValueError(f"지원하지 않는 모델입니다: {model}")

    settings = model_mapping[model]
    provider = settings["provider"]

    # One lazily-evaluated call per provider; only the selected one runs.
    synthesizers = {
        # Channel TTS returns WAV at 24kHz
        "channel": lambda: predict_channel_tts(text, settings.get("voice", "hana")),
        # OpenAI returns WAV
        "openai": lambda: predict_openai_tts(
            text,
            settings.get("model", "gpt-4o-mini-tts"),
            settings.get("voice", "coral"),
        ),
        # Google returns WAV at 24kHz
        "google": lambda: predict_google_tts(text, settings.get("voice", "ko-KR-Wavenet-A")),
        # ElevenLabs returns MP3
        "elevenlabs": lambda: predict_elevenlabs_tts(
            text, settings.get("model", "eleven_multilingual_v2")
        ),
        # Supertone returns WAV
        "supertone": lambda: predict_supertone_tts(text, settings.get("model", "sona_speech_1")),
        # CLOVA returns MP3
        "clova": lambda: predict_clova_tts(text, settings.get("speaker", "nara")),
        # Humelo might return MP3 or WAV
        "humelo": lambda: predict_humelo_tts(
            text,
            settings.get("voice", "리아"),
            settings.get("emotion", "neutral"),
        ),
    }

    if provider not in synthesizers:
        raise ValueError(f"알 수 없는 provider: {provider}")

    audio_path = synthesizers[provider]()

    if provider == "humelo":
        # Humelo's format is only known from the saved file's extension
        needs_mp3_conversion = audio_path.endswith(".mp3")
    else:
        needs_mp3_conversion = provider in ("elevenlabs", "clova")

    # Nothing to standardize when the provider produced no path
    if not audio_path:
        return audio_path

    if needs_mp3_conversion:
        # Convert MP3 to WAV at 16kHz
        return convert_mp3_to_wav_16khz(audio_path)

    # Resample WAV to 16kHz
    return resample_wav_to_16khz(audio_path)


if __name__ == "__main__":
    # Manual smoke test: synthesize one Korean sentence with Channel TTS
    test_text = "안녕하세요, 채널톡 TTS 테스트입니다."

    print("Testing Channel TTS...")
    try:
        path = predict_channel_tts(test_text)
    except Exception as e:
        print(f"  Error: {e}")
    else:
        print(f"  Success: {path}")