Spaces:

channelcorp
/

Ko-TTS-Arena

Sleeping

blackhole1218 committed on 14 days ago

Commit

0084d0b

1 Parent(s): c7ac7fd

feat: add Humelo DIVE TTS (시아 voice)

- Add Humelo DIVE TTS provider with 시아 voice
- Downloads audio from URL returned by API
- Supports emotion parameter (neutral default)
- Auto-converts to 16kHz WAV for fair comparison

Files changed (2) hide show

models.py +10 -0
tts.py +64 -0

models.py CHANGED Viewed

@@ -572,6 +572,7 @@ def insert_initial_models():
     has_google = bool(os.getenv("GOOGLE_API_KEY"))
     has_supertone = bool(os.getenv("SUPERTONE_API_KEY"))
     has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
     tts_models = [
         # 채널톡 TTS (한국어 특화) - 항상 활성화
@@ -636,6 +637,15 @@ def insert_initial_models():
             is_active=has_supertone,
             model_url="https://supertone.ai/",
         ),
     ]
     for model in tts_models:

     has_google = bool(os.getenv("GOOGLE_API_KEY"))
     has_supertone = bool(os.getenv("SUPERTONE_API_KEY"))
     has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
+    has_humelo = bool(os.getenv("HUMELO_API_KEY"))
     tts_models = [
         # 채널톡 TTS (한국어 특화) - 항상 활성화
             is_active=has_supertone,
             model_url="https://supertone.ai/",
         ),
+        # Humelo DIVE TTS (한국어 특화) - API 키 있을 때만 활성화
+        Model(
+            id="humelo-sia",
+            name="Humelo DIVE (시아)",
+            model_type=ModelType.TTS,
+            is_open=False,
+            is_active=has_humelo,
+            model_url="https://humelo.com/",
+        ),
     ]
     for model in tts_models:

tts.py CHANGED Viewed

@@ -48,6 +48,10 @@ SUPERTONE_VOICE_ID = os.getenv("SUPERTONE_VOICE_ID", "91992bbd4758bdcf9c9b01")
 CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
 CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
 def resample_wav_to_16khz(input_path: str) -> str:
     """
     Resample a WAV file to 16kHz for fair comparison.
@@ -194,6 +198,12 @@ model_mapping = {
         "provider": "supertone",
         "model": "sona_speech_1",
     },
 }
@@ -347,6 +357,51 @@ def predict_supertone_tts(text: str, model: str = "sona_speech_1") -> str:
         return f.name
 def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
     """Google Cloud TTS API 호출"""
     api_key = os.getenv("GOOGLE_API_KEY")
@@ -430,6 +485,15 @@ def predict_tts(text: str, model: str) -> str:
         audio_path = predict_clova_tts(text, config.get("speaker", "nara"))
         is_mp3 = True  # CLOVA returns MP3
     else:
         raise ValueError(f"알 수 없는 provider: {provider}")

 CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
 CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
+# Humelo DIVE TTS configuration.
+# HUMELO_API_KEY comes from the environment and is None when the variable is
+# unset; predict_humelo_tts raises ValueError in that case.
+HUMELO_API_KEY = os.getenv("HUMELO_API_KEY")
+# Endpoint that predict_humelo_tts POSTs synthesis requests to.
+HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-synthesize-v1"
 def resample_wav_to_16khz(input_path: str) -> str:
     """
     Resample a WAV file to 16kHz for fair comparison.
         "provider": "supertone",
         "model": "sona_speech_1",
     },
+    # Humelo DIVE TTS (한국어 특화)
+    "humelo-sia": {
+        "provider": "humelo",
+        "voice": "시아",
+        "emotion": "neutral",
+    },
 }
         return f.name
+def predict_humelo_tts(text: str, voice: str = "시아", emotion: str = "neutral") -> str:
+    """Synthesize speech with the Humelo DIVE TTS API and save it to a temp file.
+
+    The API is called in ``preset`` mode with ``lang`` fixed to ``"ko"``. Its
+    JSON reply is expected to carry an ``audio_url`` field; the audio is then
+    downloaded from that URL and written to a temporary file. The file is
+    created with ``delete=False``, so the caller owns the returned path.
+
+    Args:
+        text: Text to synthesize.
+        voice: Preset voice name, sent as ``voiceName``.
+        emotion: Emotion label, sent as ``emotion``.
+
+    Returns:
+        Path of the temporary audio file. The suffix is ``.mp3`` when the
+        download looks like MP3 and ``.wav`` otherwise.
+
+    Raises:
+        ValueError: If ``HUMELO_API_KEY`` is not set, or the API reply
+            contains no audio URL.
+        requests.HTTPError: If either HTTP request returns an error status
+            (other ``requests`` exceptions such as timeouts also propagate).
+    """
+    # Imported locally so this block does not depend on a file-level import.
+    from urllib.parse import urlparse
+
+    api_key = HUMELO_API_KEY
+    if not api_key:
+        raise ValueError("HUMELO_API_KEY 환경 변수가 설정되지 않았습니다.")
+    response = requests.post(
+        HUMELO_API_URL,
+        headers={
+            "Content-Type": "application/json",
+            "X-API-Key": api_key,
+        },
+        json={
+            "text": text,
+            "mode": "preset",
+            "voiceName": voice,
+            "emotion": emotion,
+            "lang": "ko",
+        },
+        timeout=60,
+    )
+    response.raise_for_status()
+    data = response.json()
+    # A non-object JSON reply (list, string, null) has no ``.get``; treat it
+    # like a reply without an audio URL instead of raising AttributeError.
+    audio_url = data.get("audio_url") if isinstance(data, dict) else None
+    if not audio_url:
+        raise ValueError("Humelo API가 오디오 URL을 반환하지 않았습니다.")
+    # Download audio from URL
+    audio_response = requests.get(audio_url, timeout=60)
+    audio_response.raise_for_status()
+    audio_bytes = audio_response.content
+    # Decide the suffix. The caller infers MP3 vs WAV from it, so it must
+    # reflect the real payload:
+    #   * a RIFF header means WAV regardless of what the server claims;
+    #   * MP3 is usually served as "audio/mpeg", which contains no "mp3";
+    #   * the URL may carry a query string, so test the extension on the
+    #     parsed path rather than on the raw URL.
+    content_type = audio_response.headers.get("Content-Type", "").lower()
+    url_path = urlparse(audio_url).path.lower()
+    if audio_bytes[:4] == b"RIFF":
+        suffix = ".wav"
+    elif (
+        audio_bytes[:3] == b"ID3"
+        or "mpeg" in content_type
+        or "mp3" in content_type
+        or url_path.endswith(".mp3")
+    ):
+        suffix = ".mp3"
+    else:
+        suffix = ".wav"
+    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
+        f.write(audio_bytes)
+        return f.name
 def predict_google_tts(text: str, voice: str = "ko-KR-Wavenet-A") -> str:
     """Google Cloud TTS API 호출"""
     api_key = os.getenv("GOOGLE_API_KEY")
         audio_path = predict_clova_tts(text, config.get("speaker", "nara"))
         is_mp3 = True  # CLOVA returns MP3
+    elif provider == "humelo":
+        audio_path = predict_humelo_tts(
+            text,
+            config.get("voice", "시아"),
+            config.get("emotion", "neutral"),
+        )
+        # Humelo might return MP3 or WAV, check extension
+        is_mp3 = audio_path.endswith(".mp3")
     else:
         raise ValueError(f"알 수 없는 provider: {provider}")