Spaces:

channelcorp
/

Ko-TTS-Arena

Running on CPU Upgrade

blackhole1218 committed 11 days ago

Commit

c7ac7fd

1 Parent(s): 7b01a99

feat: remove Discord link, add 16kHz audio resampling for fair comparison

- Remove Discord link and CSS styles from base.html
- Add audio resampling to 16kHz for all TTS outputs
- Convert MP3 outputs (ElevenLabs, CLOVA) to WAV
- Add scipy, numpy, pydub to requirements.txt
- Ensures fair audio quality comparison across providers

Files changed (3) hide show

requirements.txt +4 -1
templates/base.html +0 -25
tts.py +151 -7

requirements.txt CHANGED Viewed

@@ -10,4 +10,7 @@ apscheduler
 flask-migrate
 gunicorn
 waitress
-huggingface-hub

 flask-migrate
 gunicorn
 waitress
+huggingface-hub
+scipy
+numpy
+pydub

templates/base.html CHANGED Viewed

@@ -388,24 +388,6 @@
             margin-right: 12px;
         }
-        .discord-link {
-            display: flex;
-            align-items: center;
-            padding: 12px 16px;
-            border-top: 1px solid var(--border-color);
-            text-decoration: none;
-            color: var(--text-color);
-        }
-        .discord-link:hover {
-            background-color: var(--light-gray);
-            color: #5865F2;
-        }
-        .discord-link svg {
-            margin-right: 12px;
-        }
         .sidebar-footer {
             margin-top: auto;
             display: flex;
@@ -1126,13 +1108,6 @@
         </nav>
         <div class="sidebar-footer">
-            <a href="https://discord.gg/HB8fMR6GTr" target="_blank" rel="noopener noreferrer" class="discord-link">
-                <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 127.14 96.36" fill="currentColor">
-                    <path d="M107.7,8.07A105.15,105.15,0,0,0,81.47,0a72.06,72.06,0,0,0-3.36,6.83A97.68,97.68,0,0,0,49,6.83,72.37,72.37,0,0,0,45.64,0,105.89,105.89,0,0,0,19.39,8.09C2.79,32.65-1.71,56.6.54,80.21h0A105.73,105.73,0,0,0,32.71,96.36,77.7,77.7,0,0,0,39.6,85.25a68.42,68.42,0,0,1-10.85-5.18c.91-.66,1.8-1.34,2.66-2a75.57,75.57,0,0,0,64.32,0c.87.71,1.76,1.39,2.66,2a68.68,68.68,0,0,1-10.87,5.19,77,77,0,0,0,6.89,11.1A105.25,105.25,0,0,0,126.6,80.22h0C129.24,52.84,122.09,29.11,107.7,8.07ZM42.45,65.69C36.18,65.69,31,60,31,53s5-12.74,11.43-12.74S54,46,53.89,53,48.84,65.69,42.45,65.69Zm42.24,0C78.41,65.69,73.25,60,73.25,53s5-12.74,11.44-12.74S96.23,46,96.12,53,91.08,65.69,84.69,65.69Z"/>
-                </svg>
-                Join our Discord
-            </a>
             {% if current_user.is_authenticated %}
             <div class="user-auth" onclick="toggleUserDropdown(event)">
                 <div class="user-name">{{ current_user.username }}</div>

             margin-right: 12px;
         }
         .sidebar-footer {
             margin-top: auto;
             display: flex;
         </nav>
         <div class="sidebar-footer">
             {% if current_user.is_authenticated %}
             <div class="user-auth" onclick="toggleUserDropdown(event)">
                 <div class="user-name">{{ current_user.username }}</div>

tts.py CHANGED Viewed

@@ -6,10 +6,25 @@ import tempfile
 import requests
 import urllib.request
 import urllib.parse
 from dotenv import load_dotenv
 load_dotenv()
 # 한국어 지원 TTS 제공자 매핑
 # - 채널톡: 자체 API
 # - ElevenLabs: 직접 API
@@ -33,6 +48,116 @@ SUPERTONE_VOICE_ID = os.getenv("SUPERTONE_VOICE_ID", "91992bbd4758bdcf9c9b01")
 CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
 CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
 model_mapping = {
     # 채널톡 TTS (한국어 특화)
     "channel-hana": {
@@ -265,7 +390,7 @@ def predict_tts(text: str, model: str) -> str:
         model: 모델 ID (model_mapping의 키)
     Returns:
-        생성된 오디오 파일 경로
     """
     print(f"[TTS] Predicting for model: {model}")
@@ -274,31 +399,50 @@ def predict_tts(text: str, model: str) -> str:
     config = model_mapping[model]
     provider = config["provider"]
     if provider == "channel":
-        return predict_channel_tts(text, config.get("voice", "hana"))
     elif provider == "openai":
-        return predict_openai_tts(
             text,
             config.get("model", "gpt-4o-mini-tts"),
             config.get("voice", "coral"),
         )
     elif provider == "google":
-        return predict_google_tts(text, config.get("voice", "ko-KR-Wavenet-A"))
     elif provider == "elevenlabs":
-        return predict_elevenlabs_tts(text, config.get("model", "eleven_multilingual_v2"))
     elif provider == "supertone":
-        return predict_supertone_tts(text, config.get("model", "sona_speech_1"))
     elif provider == "clova":
-        return predict_clova_tts(text, config.get("speaker", "nara"))
     else:
         raise ValueError(f"알 수 없는 provider: {provider}")
 if __name__ == "__main__":

 import requests
 import urllib.request
 import urllib.parse
+import wave
+import struct
 from dotenv import load_dotenv
+# Optional: scipy for high-quality resampling
+try:
+    from scipy import signal
+    from scipy.io import wavfile
+    import numpy as np
+    HAS_SCIPY = True
+except ImportError:
+    HAS_SCIPY = False
+    print("Warning: scipy not installed. Using basic resampling.")
 load_dotenv()
+# Target sample rate for all TTS outputs (for fair comparison)
+TARGET_SAMPLE_RATE = 16000
 # 한국어 지원 TTS 제공자 매핑
 # - 채널톡: 자체 API
 # - ElevenLabs: 직접 API
 CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
 CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
+def resample_wav_to_16khz(input_path: str) -> str:
+    """
+    Resample a WAV file to 16kHz for fair comparison.
+    Returns the path to the resampled file.
+    """
+    if not HAS_SCIPY:
+        # If scipy is not available, return original file
+        print(f"[Resample] scipy not available, skipping resample for {input_path}")
+        return input_path
+    try:
+        # Read the original WAV file
+        original_rate, data = wavfile.read(input_path)
+        # If already 16kHz, return as-is
+        if original_rate == TARGET_SAMPLE_RATE:
+            print(f"[Resample] Already {TARGET_SAMPLE_RATE}Hz, no resample needed")
+            return input_path
+        print(f"[Resample] Resampling from {original_rate}Hz to {TARGET_SAMPLE_RATE}Hz")
+        # Handle stereo to mono conversion if needed
+        if len(data.shape) > 1:
+            data = data.mean(axis=1).astype(data.dtype)
+        # Calculate the number of samples in the output
+        num_samples = int(len(data) * TARGET_SAMPLE_RATE / original_rate)
+        # Resample using scipy
+        resampled_data = signal.resample(data, num_samples)
+        # Normalize to int16 range
+        if resampled_data.dtype != np.int16:
+            # Normalize float to int16
+            max_val = np.max(np.abs(resampled_data))
+            if max_val > 0:
+                resampled_data = (resampled_data / max_val * 32767).astype(np.int16)
+            else:
+                resampled_data = resampled_data.astype(np.int16)
+        # Save to new temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+            output_path = f.name
+        wavfile.write(output_path, TARGET_SAMPLE_RATE, resampled_data)
+        # Remove original file
+        os.remove(input_path)
+        print(f"[Resample] Successfully resampled to {output_path}")
+        return output_path
+    except Exception as e:
+        print(f"[Resample] Error resampling: {e}, returning original")
+        return input_path
+def convert_mp3_to_wav_16khz(input_path: str) -> str:
+    """
+    Convert MP3 to WAV at 16kHz using pydub (if available) or ffmpeg.
+    """
+    try:
+        from pydub import AudioSegment
+        print(f"[Convert] Converting MP3 to WAV 16kHz: {input_path}")
+        # Load MP3
+        audio = AudioSegment.from_mp3(input_path)
+        # Convert to mono and set sample rate
+        audio = audio.set_channels(1).set_frame_rate(TARGET_SAMPLE_RATE)
+        # Export as WAV
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+            output_path = f.name
+        audio.export(output_path, format="wav")
+        # Remove original MP3
+        os.remove(input_path)
+        print(f"[Convert] Successfully converted to {output_path}")
+        return output_path
+    except ImportError:
+        print("[Convert] pydub not available, trying ffmpeg directly")
+        try:
+            import subprocess
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+                output_path = f.name
+            subprocess.run([
+                "ffmpeg", "-y", "-i", input_path,
+                "-ar", str(TARGET_SAMPLE_RATE),
+                "-ac", "1",
+                output_path
+            ], check=True, capture_output=True)
+            os.remove(input_path)
+            return output_path
+        except Exception as e:
+            print(f"[Convert] ffmpeg conversion failed: {e}, returning original")
+            return input_path
+    except Exception as e:
+        print(f"[Convert] Error converting: {e}, returning original")
+        return input_path
 model_mapping = {
     # 채널톡 TTS (한국어 특화)
     "channel-hana": {
         model: 모델 ID (model_mapping의 키)
     Returns:
+        생성된 오디오 파일 경로 (16kHz WAV로 통일)
     """
     print(f"[TTS] Predicting for model: {model}")
     config = model_mapping[model]
     provider = config["provider"]
+    audio_path = None
+    is_mp3 = False
     if provider == "channel":
+        audio_path = predict_channel_tts(text, config.get("voice", "hana"))
+        # Channel TTS returns WAV at 24kHz
     elif provider == "openai":
+        audio_path = predict_openai_tts(
             text,
             config.get("model", "gpt-4o-mini-tts"),
             config.get("voice", "coral"),
         )
+        # OpenAI returns WAV
     elif provider == "google":
+        audio_path = predict_google_tts(text, config.get("voice", "ko-KR-Wavenet-A"))
+        # Google returns WAV at 24kHz
     elif provider == "elevenlabs":
+        audio_path = predict_elevenlabs_tts(text, config.get("model", "eleven_multilingual_v2"))
+        is_mp3 = True  # ElevenLabs returns MP3
     elif provider == "supertone":
+        audio_path = predict_supertone_tts(text, config.get("model", "sona_speech_1"))
+        # Supertone returns WAV
     elif provider == "clova":
+        audio_path = predict_clova_tts(text, config.get("speaker", "nara"))
+        is_mp3 = True  # CLOVA returns MP3
     else:
         raise ValueError(f"알 수 없는 provider: {provider}")
+    # Standardize to 16kHz WAV for fair comparison
+    if audio_path:
+        if is_mp3:
+            # Convert MP3 to WAV at 16kHz
+            audio_path = convert_mp3_to_wav_16khz(audio_path)
+        else:
+            # Resample WAV to 16kHz
+            audio_path = resample_wav_to_16khz(audio_path)
+    return audio_path
 if __name__ == "__main__":