# Korean TTS Arena - TTS Router
import os
import json
import base64
import tempfile
import requests
import urllib.request
import urllib.parse
import wave
import struct
from dotenv import load_dotenv
# Optional: scipy for high-quality resampling
try:
from scipy import signal
from scipy.io import wavfile
import numpy as np
HAS_SCIPY = True
except ImportError:
HAS_SCIPY = False
print("Warning: scipy not installed. Using basic resampling.")
load_dotenv()
# Target sample rate for all TTS outputs (for fair comparison)
TARGET_SAMPLE_RATE = 16000
# Korean-capable TTS provider mapping
# - Channel Talk: in-house API
# - ElevenLabs: direct API
# - OpenAI: API (gpt-4o-mini-tts)
# - Google: API
# - CLOVA: Naver Cloud API
# - Supertone: API
CHANNEL_TTS_URL = os.getenv(
"CHANNEL_TTS_URL",
"https://ch-tts-streaming-demo.channel.io/v1/text-to-speech"
)
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")  # Rachel (default)
SUPERTONE_API_KEY = os.getenv("SUPERTONE_API_KEY")
SUPERTONE_VOICE_ID = os.getenv("SUPERTONE_VOICE_ID", "91992bbd4758bdcf9c9b01")  # default voice
# CLOVA TTS (Naver Cloud)
CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
# Humelo DIVE TTS
HUMELO_API_KEY = os.getenv("HUMELO_API_KEY")
HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-synthesize-v1"
# Typecast TTS
TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
# Gemini TTS (Google Cloud) - requires a service account JSON (API keys are not supported)
GOOGLE_APPLICATION_CREDENTIALS_JSON = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
if GOOGLE_APPLICATION_CREDENTIALS_JSON:
_credentials_path = os.path.join(tempfile.gettempdir(), "google_credentials.json")
try:
with open(_credentials_path, "w") as f:
f.write(GOOGLE_APPLICATION_CREDENTIALS_JSON)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = _credentials_path
print("[Gemini TTS] Service account credentials loaded")
except Exception as e:
print(f"[Gemini TTS] Failed to save credentials: {e}")
def resample_wav_to_16khz(input_path: str) -> str:
"""
Resample a WAV file to 16kHz for fair comparison.
Returns the path to the resampled file.
"""
if not HAS_SCIPY:
# If scipy is not available, return original file
print(f"[Resample] scipy not available, skipping resample for {input_path}")
return input_path
try:
# Read the original WAV file
original_rate, data = wavfile.read(input_path)
# If already 16kHz, return as-is
if original_rate == TARGET_SAMPLE_RATE:
print(f"[Resample] Already {TARGET_SAMPLE_RATE}Hz, no resample needed")
return input_path
print(f"[Resample] Resampling from {original_rate}Hz to {TARGET_SAMPLE_RATE}Hz")
# Handle stereo to mono conversion if needed
if len(data.shape) > 1:
data = data.mean(axis=1).astype(data.dtype)
# Calculate the number of samples in the output
num_samples = int(len(data) * TARGET_SAMPLE_RATE / original_rate)
# Resample using scipy
resampled_data = signal.resample(data, num_samples)
# Normalize to int16 range
if resampled_data.dtype != np.int16:
# Normalize float to int16
max_val = np.max(np.abs(resampled_data))
if max_val > 0:
resampled_data = (resampled_data / max_val * 32767).astype(np.int16)
else:
resampled_data = resampled_data.astype(np.int16)
# Save to new temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
output_path = f.name
wavfile.write(output_path, TARGET_SAMPLE_RATE, resampled_data)
# Remove original file
os.remove(input_path)
print(f"[Resample] Successfully resampled to {output_path}")
return output_path
except Exception as e:
print(f"[Resample] Error resampling: {e}, returning original")
return input_path
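# NOTE: When scipy is missing, resample_wav_to_16khz() above skips resampling
# entirely. The helper below is an illustrative, stdlib-only fallback sketch
# (linear interpolation, 16-bit PCM WAV assumed). It is an added assumption for
# clarity and is not called anywhere in the routing flow.
def _basic_resample_wav_16khz(input_path: str) -> str:
    """Naive linear-interpolation resample to TARGET_SAMPLE_RATE (sketch only)."""
    with wave.open(input_path, "rb") as wf:
        n_channels = wf.getnchannels()
        samp_width = wf.getsampwidth()
        src_rate = wf.getframerate()
        frames = wf.readframes(wf.getnframes())
    # Only handle 16-bit PCM; skip if unsupported or already at the target rate.
    if samp_width != 2 or src_rate == TARGET_SAMPLE_RATE:
        return input_path
    samples = list(struct.unpack(f"<{len(frames) // 2}h", frames))
    if n_channels > 1:
        # Average interleaved channels down to mono.
        samples = [
            sum(samples[i:i + n_channels]) // n_channels
            for i in range(0, len(samples), n_channels)
        ]
    ratio = src_rate / TARGET_SAMPLE_RATE
    out = []
    for i in range(int(len(samples) / ratio)):
        pos = i * ratio
        lo = int(pos)
        hi = min(lo + 1, len(samples) - 1)
        frac = pos - lo
        out.append(int(samples[lo] * (1 - frac) + samples[hi] * frac))
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        output_path = f.name
    with wave.open(output_path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(TARGET_SAMPLE_RATE)
        wf.writeframes(struct.pack(f"<{len(out)}h", *out))
    os.remove(input_path)
    return output_path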
def convert_mp3_to_wav_16khz(input_path: str) -> str:
"""
Convert MP3 to WAV at 16kHz using pydub (if available) or ffmpeg.
"""
try:
from pydub import AudioSegment
print(f"[Convert] Converting MP3 to WAV 16kHz: {input_path}")
# Load MP3
audio = AudioSegment.from_mp3(input_path)
# Convert to mono and set sample rate
audio = audio.set_channels(1).set_frame_rate(TARGET_SAMPLE_RATE)
# Export as WAV
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
output_path = f.name
audio.export(output_path, format="wav")
# Remove original MP3
os.remove(input_path)
print(f"[Convert] Successfully converted to {output_path}")
return output_path
except ImportError:
print("[Convert] pydub not available, trying ffmpeg directly")
try:
import subprocess
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
output_path = f.name
subprocess.run([
"ffmpeg", "-y", "-i", input_path,
"-ar", str(TARGET_SAMPLE_RATE),
"-ac", "1",
output_path
], check=True, capture_output=True)
os.remove(input_path)
return output_path
except Exception as e:
print(f"[Convert] ffmpeg conversion failed: {e}, returning original")
return input_path
except Exception as e:
print(f"[Convert] Error converting: {e}, returning original")
return input_path
model_mapping = {
    # Channel Talk TTS (Korean-specialized)
"channel-hana": {
"provider": "channel",
"voice": "hana",
},
    # ElevenLabs (multilingual) - direct API call
"eleven-multilingual-v2": {
"provider": "elevenlabs",
"model": "eleven_multilingual_v2",
},
# OpenAI TTS (gpt-4o-mini-tts)
"openai-gpt-4o-mini-tts": {
"provider": "openai",
"model": "gpt-4o-mini-tts",
"voice": "coral",
},
# Google Cloud TTS
"google-wavenet": {
"provider": "google",
"voice": "ko-KR-Wavenet-A",
},
"google-neural2": {
"provider": "google",
"voice": "ko-KR-Neural2-A",
},
    # CLOVA TTS (Naver Cloud - Korean-specialized)
"clova-nara": {
"provider": "clova",
"speaker": "nara",
},
    # Supertone TTS (Korean-specialized)
"supertone-sona": {
"provider": "supertone",
"model": "sona_speech_1",
},
    # Humelo DIVE TTS (Korean-specialized)
"humelo-sia": {
"provider": "humelo",
"voice": "리아",
"emotion": "neutral",
},
    # Typecast TTS v3.0 (Korean-specialized) - new voices
"typecast-jaesun": {
"provider": "typecast",
"voice_id": "tc_684a7a1446e2a628b5b07230", # 재선
"model": "ssfm-v30",
},
"typecast-jain": {
"provider": "typecast",
"voice_id": "tc_6809c111e5e8c73f8a0237b2", # 자인
"model": "ssfm-v30",
},
    # Legacy Typecast IDs - both route to the new Jaesun voice
"typecast-geumhee": {
"provider": "typecast",
"voice_id": "tc_684a7a1446e2a628b5b07230",
"model": "ssfm-v30",
},
"typecast-jaeyi": {
"provider": "typecast",
"voice_id": "tc_684a7a1446e2a628b5b07230",
"model": "ssfm-v30",
},
    # Gemini TTS (Google Cloud - multilingual)
"gemini-tts-aoede": {
"provider": "gemini",
"voice": "Aoede",
"model": "gemini-2.5-flash-tts",
},
}
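# Example of registering an additional model entry (illustrative only; the key
# and voice below are placeholders showing the expected config shape, not a
# model this arena actually ships):
#
# model_mapping["google-standard"] = {
#     "provider": "google",
#     "voice": "ko-KR-Standard-A",
# }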
def predict_channel_tts(text: str, voice: str = "hana") -> str:
"""채널톡 TTS API 호출"""
url = f"{CHANNEL_TTS_URL}/{voice}"
response = requests.post(
url,
headers={"Content-Type": "application/json"},
json={"text": text, "output_format": "wav_24000"},
timeout=30,
)
response.raise_for_status()
    # Save the response to a temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(response.content)
return f.name
def predict_elevenlabs_tts(text: str, model: str = "eleven_multilingual_v2") -> str:
"""ElevenLabs TTS API 직접 호출"""
api_key = ELEVENLABS_API_KEY
if not api_key:
raise ValueError("ELEVENLABS_API_KEY 환경 변수가 설정되지 않았습니다.")
voice_id = ELEVENLABS_VOICE_ID
response = requests.post(
f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
headers={
"xi-api-key": api_key,
"Content-Type": "application/json",
"Accept": "audio/mpeg",
},
json={
"text": text,
"model_id": model,
"voice_settings": {
"stability": 0.5,
"similarity_boost": 0.75,
},
},
timeout=60,
)
response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
f.write(response.content)
return f.name
def predict_openai_tts(text: str, model: str = "gpt-4o-mini-tts", voice: str = "coral") -> str:
"""OpenAI TTS API 호출 (gpt-4o-mini-tts 지원)"""
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY 환경 변수가 설정되지 않았습니다.")
    # Instructions for gpt-4o-mini-tts (tuned for Korean TTS)
instructions = """Voice: Natural and clear Korean voice, with appropriate intonation and rhythm.
Punctuation: Well-structured with natural pauses for clarity.
Delivery: Calm, professional, and easy to understand.
Phrasing: Clear pronunciation with proper Korean phonetics.
Tone: Friendly yet professional, suitable for various contexts."""
payload = {
"model": model,
"input": text,
"voice": voice,
"response_format": "wav",
}
    # The gpt-4o-mini-tts model supports the instructions field
if model == "gpt-4o-mini-tts":
payload["instructions"] = instructions
response = requests.post(
"https://api.openai.com/v1/audio/speech",
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
},
json=payload,
timeout=60,
)
response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(response.content)
return f.name
def predict_clova_tts(text: str, speaker: str = "nara") -> str:
"""네이버 클라우드 CLOVA TTS API 호출"""
client_id = CLOVA_CLIENT_ID
client_secret = CLOVA_API_KEY
if not client_id or not client_secret:
raise ValueError("CLOVA_CLIENT_ID 또는 CLOVA_API_KEY 환경 변수가 설정되지 않았습니다.")
enc_text = urllib.parse.quote(text)
data = f"speaker={speaker}&volume=0&speed=0&pitch=0&format=mp3&text={enc_text}"
url = "https://naveropenapi.apigw.ntruss.com/tts-premium/v1/tts"
request = urllib.request.Request(url)
request.add_header("X-NCP-APIGW-API-KEY-ID", client_id)
request.add_header("X-NCP-APIGW-API-KEY", client_secret)
response = urllib.request.urlopen(request, data=data.encode('utf-8'), timeout=60)
if response.getcode() != 200:
raise ValueError(f"CLOVA TTS API 오류: {response.getcode()}")
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
f.write(response.read())
return f.name
def predict_supertone_tts(text: str, model: str = "sona_speech_1") -> str:
"""Supertone TTS API 호출"""
api_key = SUPERTONE_API_KEY
if not api_key:
raise ValueError("SUPERTONE_API_KEY 환경 변수가 설정되지 않았습니다.")
voice_id = SUPERTONE_VOICE_ID
response = requests.post(
f"https://supertoneapi.com/v1/text-to-speech/{voice_id}",
headers={
"x-sup-api-key": api_key,
"Content-Type": "application/json",
},
json={
"text": text,
"language": "ko",
"model": model,
"output_format": "wav",
"voice_settings": {
"pitch_shift": 0,
"pitch_variance": 1,
"speed": 1,
},
},
timeout=60,
)
response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(response.content)
return f.name
def predict_humelo_tts(text: str, voice: str = "리아", emotion: str = "neutral") -> str:
"""Humelo DIVE TTS API 호출"""
api_key = HUMELO_API_KEY
if not api_key:
raise ValueError("HUMELO_API_KEY 환경 변수가 설정되지 않았습니다.")
response = requests.post(
HUMELO_API_URL,
headers={
"Content-Type": "application/json",
"X-API-Key": api_key,
},
json={
"text": text,
"mode": "preset",
"voiceName": voice,
"emotion": emotion,
"lang": "ko",
},
timeout=60,
)
response.raise_for_status()
data = response.json()
audio_url = data.get("audio_url")
if not audio_url:
raise ValueError("Humelo API가 오디오 URL을 반환하지 않았습니다.")
# Download audio from URL
audio_response = requests.get(audio_url, timeout=60)
audio_response.raise_for_status()
# Determine file extension from URL or content-type
content_type = audio_response.headers.get("Content-Type", "")
if "mp3" in content_type or audio_url.endswith(".mp3"):
suffix = ".mp3"
else:
suffix = ".wav"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
f.write(audio_response.content)
return f.name
def predict_typecast_tts(text: str, voice_id: str, model: str = "ssfm-v30", max_retries: int = 3) -> str:
"""Typecast TTS API 호출 (v3.0 experiment API) with retry logic"""
import time
api_key = TYPECAST_API_KEY
if not api_key:
raise ValueError("TYPECAST_API_KEY 환경 변수가 설정되지 않았습니다.")
url = "https://api.typecast.ai/experiment/text-to-speech"
payload = {
"voice_id": voice_id,
"text": text,
"model": model
}
headers = {
"X-API-KEY": api_key,
"Content-Type": "application/json"
}
last_error = None
for attempt in range(max_retries):
try:
response = requests.post(url, headers=headers, json=payload, timeout=60)
response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(response.content)
return f.name
except requests.exceptions.RequestException as e:
last_error = e
print(f"[Typecast] Attempt {attempt + 1}/{max_retries} failed: {e}")
if attempt < max_retries - 1:
                wait_time = (attempt + 1) * 2  # 2s, 4s, ...
print(f"[Typecast] Retrying in {wait_time}s...")
time.sleep(wait_time)
raise ValueError(f"Typecast TTS API 오류 (재시도 {max_retries}회 실패): {last_error}")
def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
"""Gemini TTS API 호출 (서비스 계정 JSON 필요)"""
if not GOOGLE_APPLICATION_CREDENTIALS_JSON:
raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON 환경 변수가 설정되지 않았습니다.")
try:
from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech
client = texttospeech.TextToSpeechClient(
client_options=ClientOptions(api_endpoint="texttospeech.googleapis.com")
)
voice_params = texttospeech.VoiceSelectionParams(
name=voice,
language_code="ko-kr",
model_name=model,
)
response = client.synthesize_speech(
input=texttospeech.SynthesisInput(
text=text,
prompt="친절하고 자연스러운 톤으로 말해주세요",
),
voice=voice_params,
audio_config=texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
sample_rate_hertz=24000,
),
)
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(response.audio_content)
return f.name
except ImportError:
raise ValueError("google-cloud-texttospeech 패키지가 설치되지 않았습니다.")
except Exception as e:
raise ValueError(f"Gemini TTS API 오류: {str(e)}")
def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
"""Google Cloud TTS API 호출"""
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
raise ValueError("GOOGLE_API_KEY 환경 변수가 설정되지 않았습니다.")
response = requests.post(
f"https://texttospeech.googleapis.com/v1/text:synthesize?key={api_key}",
headers={"Content-Type": "application/json"},
json={
"input": {"text": text},
"voice": {
"languageCode": "ko-KR",
"name": voice,
},
"audioConfig": {
"audioEncoding": "LINEAR16",
"sampleRateHertz": 24000,
},
},
timeout=30,
)
response.raise_for_status()
audio_content = response.json().get("audioContent")
if not audio_content:
raise ValueError("Google TTS API가 오디오를 반환하지 않았습니다.")
audio_bytes = base64.b64decode(audio_content)
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
f.write(audio_bytes)
return f.name
def predict_tts(text: str, model: str) -> str:
"""
TTS 생성 메인 함수
Args:
text: 합성할 텍스트
model: 모델 ID (model_mapping의 키)
Returns:
생성된 오디오 파일 경로 (16kHz WAV로 통일)
"""
print(f"[TTS] Predicting for model: {model}")
if model not in model_mapping:
raise ValueError(f"지원하지 않는 모델입니다: {model}")
config = model_mapping[model]
provider = config["provider"]
audio_path = None
is_mp3 = False
if provider == "channel":
audio_path = predict_channel_tts(text, config.get("voice", "hana"))
# Channel TTS returns WAV at 24kHz
elif provider == "openai":
audio_path = predict_openai_tts(
text,
config.get("model", "gpt-4o-mini-tts"),
config.get("voice", "coral"),
)
# OpenAI returns WAV
elif provider == "google":
audio_path = predict_google_tts(text, config.get("voice", "ko-KR-Wavenet-A"))
# Google returns WAV at 24kHz
elif provider == "elevenlabs":
audio_path = predict_elevenlabs_tts(text, config.get("model", "eleven_multilingual_v2"))
is_mp3 = True # ElevenLabs returns MP3
elif provider == "supertone":
audio_path = predict_supertone_tts(text, config.get("model", "sona_speech_1"))
# Supertone returns WAV
elif provider == "clova":
audio_path = predict_clova_tts(text, config.get("speaker", "nara"))
is_mp3 = True # CLOVA returns MP3
elif provider == "humelo":
audio_path = predict_humelo_tts(
text,
config.get("voice", "리아"),
config.get("emotion", "neutral"),
)
# Humelo might return MP3 or WAV, check extension
is_mp3 = audio_path.endswith(".mp3")
elif provider == "typecast":
audio_path = predict_typecast_tts(
text,
config.get("voice_id", "tc_684a7a1446e2a628b5b07230"), # 재선 (v3.0)
config.get("model", "ssfm-v30"),
)
# Typecast returns WAV
elif provider == "gemini":
audio_path = predict_gemini_tts(
text,
config.get("voice", "Aoede"),
config.get("model", "gemini-2.5-flash-tts"),
)
# Gemini TTS returns WAV at 24kHz
else:
raise ValueError(f"알 수 없는 provider: {provider}")
# Standardize to 16kHz WAV for fair comparison
if audio_path:
if is_mp3:
# Convert MP3 to WAV at 16kHz
audio_path = convert_mp3_to_wav_16khz(audio_path)
else:
# Resample WAV to 16kHz
audio_path = resample_wav_to_16khz(audio_path)
return audio_path
if __name__ == "__main__":
    # Quick smoke test
test_text = "안녕하세요, 채널톡 TTS 테스트입니다."
print("Testing Channel TTS...")
try:
path = predict_channel_tts(test_text)
print(f" Success: {path}")
except Exception as e:
print(f" Error: {e}")