blackhole1218 committed on
Commit
c7ac7fd
·
1 Parent(s): 7b01a99

feat: remove Discord link, add 16kHz audio resampling for fair comparison

Browse files

- Remove Discord link and CSS styles from base.html
- Add audio resampling to 16kHz for all TTS outputs
- Convert MP3 outputs (ElevenLabs, CLOVA) to WAV
- Add scipy, numpy, pydub to requirements.txt
- Ensures fair audio quality comparison across providers

Files changed (3) hide show
  1. requirements.txt +4 -1
  2. templates/base.html +0 -25
  3. tts.py +151 -7
requirements.txt CHANGED
@@ -10,4 +10,7 @@ apscheduler
10
  flask-migrate
11
  gunicorn
12
  waitress
13
- huggingface-hub
 
 
 
 
10
  flask-migrate
11
  gunicorn
12
  waitress
13
+ huggingface-hub
14
+ scipy
15
+ numpy
16
+ pydub
templates/base.html CHANGED
@@ -388,24 +388,6 @@
388
  margin-right: 12px;
389
  }
390
 
391
- .discord-link {
392
- display: flex;
393
- align-items: center;
394
- padding: 12px 16px;
395
- border-top: 1px solid var(--border-color);
396
- text-decoration: none;
397
- color: var(--text-color);
398
- }
399
-
400
- .discord-link:hover {
401
- background-color: var(--light-gray);
402
- color: #5865F2;
403
- }
404
-
405
- .discord-link svg {
406
- margin-right: 12px;
407
- }
408
-
409
  .sidebar-footer {
410
  margin-top: auto;
411
  display: flex;
@@ -1126,13 +1108,6 @@
1126
  </nav>
1127
 
1128
  <div class="sidebar-footer">
1129
- <a href="https://discord.gg/HB8fMR6GTr" target="_blank" rel="noopener noreferrer" class="discord-link">
1130
- <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 127.14 96.36" fill="currentColor">
1131
- <path d="M107.7,8.07A105.15,105.15,0,0,0,81.47,0a72.06,72.06,0,0,0-3.36,6.83A97.68,97.68,0,0,0,49,6.83,72.37,72.37,0,0,0,45.64,0,105.89,105.89,0,0,0,19.39,8.09C2.79,32.65-1.71,56.6.54,80.21h0A105.73,105.73,0,0,0,32.71,96.36,77.7,77.7,0,0,0,39.6,85.25a68.42,68.42,0,0,1-10.85-5.18c.91-.66,1.8-1.34,2.66-2a75.57,75.57,0,0,0,64.32,0c.87.71,1.76,1.39,2.66,2a68.68,68.68,0,0,1-10.87,5.19,77,77,0,0,0,6.89,11.1A105.25,105.25,0,0,0,126.6,80.22h0C129.24,52.84,122.09,29.11,107.7,8.07ZM42.45,65.69C36.18,65.69,31,60,31,53s5-12.74,11.43-12.74S54,46,53.89,53,48.84,65.69,42.45,65.69Zm42.24,0C78.41,65.69,73.25,60,73.25,53s5-12.74,11.44-12.74S96.23,46,96.12,53,91.08,65.69,84.69,65.69Z"/>
1132
- </svg>
1133
- Join our Discord
1134
- </a>
1135
-
1136
  {% if current_user.is_authenticated %}
1137
  <div class="user-auth" onclick="toggleUserDropdown(event)">
1138
  <div class="user-name">{{ current_user.username }}</div>
 
388
  margin-right: 12px;
389
  }
390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  .sidebar-footer {
392
  margin-top: auto;
393
  display: flex;
 
1108
  </nav>
1109
 
1110
  <div class="sidebar-footer">
 
 
 
 
 
 
 
1111
  {% if current_user.is_authenticated %}
1112
  <div class="user-auth" onclick="toggleUserDropdown(event)">
1113
  <div class="user-name">{{ current_user.username }}</div>
tts.py CHANGED
@@ -6,10 +6,25 @@ import tempfile
6
  import requests
7
  import urllib.request
8
  import urllib.parse
 
 
9
  from dotenv import load_dotenv
10
 
 
 
 
 
 
 
 
 
 
 
11
  load_dotenv()
12
 
 
 
 
13
  # 한국어 지원 TTS 제공자 매핑
14
  # - 채널톡: 자체 API
15
  # - ElevenLabs: 직접 API
@@ -33,6 +48,116 @@ SUPERTONE_VOICE_ID = os.getenv("SUPERTONE_VOICE_ID", "91992bbd4758bdcf9c9b01")
33
  CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
34
  CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  model_mapping = {
37
  # 채널톡 TTS (한국어 특화)
38
  "channel-hana": {
@@ -265,7 +390,7 @@ def predict_tts(text: str, model: str) -> str:
265
  model: 모델 ID (model_mapping의 키)
266
 
267
  Returns:
268
- 생성된 오디오 파일 경로
269
  """
270
  print(f"[TTS] Predicting for model: {model}")
271
 
@@ -274,31 +399,50 @@ def predict_tts(text: str, model: str) -> str:
274
 
275
  config = model_mapping[model]
276
  provider = config["provider"]
 
 
277
 
278
  if provider == "channel":
279
- return predict_channel_tts(text, config.get("voice", "hana"))
 
280
 
281
  elif provider == "openai":
282
- return predict_openai_tts(
283
  text,
284
  config.get("model", "gpt-4o-mini-tts"),
285
  config.get("voice", "coral"),
286
  )
 
287
 
288
  elif provider == "google":
289
- return predict_google_tts(text, config.get("voice", "ko-KR-Wavenet-A"))
 
290
 
291
  elif provider == "elevenlabs":
292
- return predict_elevenlabs_tts(text, config.get("model", "eleven_multilingual_v2"))
 
293
 
294
  elif provider == "supertone":
295
- return predict_supertone_tts(text, config.get("model", "sona_speech_1"))
 
296
 
297
  elif provider == "clova":
298
- return predict_clova_tts(text, config.get("speaker", "nara"))
 
299
 
300
  else:
301
  raise ValueError(f"알 수 없는 provider: {provider}")
 
 
 
 
 
 
 
 
 
 
 
302
 
303
 
304
  if __name__ == "__main__":
 
6
  import requests
7
  import urllib.request
8
  import urllib.parse
9
+ import wave
10
+ import struct
11
  from dotenv import load_dotenv
12
 
13
+ # Optional: scipy for high-quality resampling
14
+ try:
15
+ from scipy import signal
16
+ from scipy.io import wavfile
17
+ import numpy as np
18
+ HAS_SCIPY = True
19
+ except ImportError:
20
+ HAS_SCIPY = False
21
+ print("Warning: scipy not installed. Using basic resampling.")
22
+
23
  load_dotenv()
24
 
25
+ # Target sample rate for all TTS outputs (for fair comparison)
26
+ TARGET_SAMPLE_RATE = 16000
27
+
28
  # 한국어 지원 TTS 제공자 매핑
29
  # - 채널톡: 자체 API
30
  # - ElevenLabs: 직접 API
 
48
  CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
49
  CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
50
 
51
def resample_wav_to_16khz(input_path: str) -> str:
    """
    Resample a WAV file to TARGET_SAMPLE_RATE (mono, 16-bit PCM).

    The relative loudness of the input is preserved: samples are scaled
    according to the source sample format instead of being peak-normalized,
    so clips from different providers stay comparable.

    Args:
        input_path: Path to the source WAV file.

    Returns:
        Path to a new temporary WAV file on success (the original file is
        then deleted on a best-effort basis). The unchanged ``input_path``
        is returned when scipy is unavailable, when the file is already at
        the target rate, or when any error occurs during resampling.
    """
    if not HAS_SCIPY:
        # If scipy is not available, return original file
        print(f"[Resample] scipy not available, skipping resample for {input_path}")
        return input_path

    output_path = None
    try:
        from math import gcd

        original_rate, data = wavfile.read(input_path)

        # If already at the target rate, return as-is
        if original_rate == TARGET_SAMPLE_RATE:
            print(f"[Resample] Already {TARGET_SAMPLE_RATE}Hz, no resample needed")
            return input_path

        print(f"[Resample] Resampling from {original_rate}Hz to {TARGET_SAMPLE_RATE}Hz")

        # Bring every supported sample format onto a common [-1.0, 1.0) float
        # scale so the int16 conversion below does not depend on the input dtype.
        if np.issubdtype(data.dtype, np.floating):
            samples = data.astype(np.float64)
        elif data.dtype == np.uint8:
            # 8-bit WAV is unsigned with a midpoint of 128.
            samples = (data.astype(np.float64) - 128.0) / 128.0
        else:
            full_scale = float(np.iinfo(data.dtype).max) + 1.0
            samples = data.astype(np.float64) / full_scale

        # Down-mix multi-channel audio to mono
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        if samples.size:
            # Polyphase filtering avoids the edge wrap-around of FFT-based
            # resampling, which treats the signal as periodic.
            divisor = gcd(int(TARGET_SAMPLE_RATE), int(original_rate))
            up = int(TARGET_SAMPLE_RATE) // divisor
            down = int(original_rate) // divisor
            samples = signal.resample_poly(samples, up, down)

        # Convert to 16-bit PCM; clip because filter overshoot can slightly
        # exceed full scale.
        pcm = np.clip(np.round(samples * 32767.0), -32768, 32767).astype(np.int16)

        # Save to new temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            output_path = f.name

        wavfile.write(output_path, TARGET_SAMPLE_RATE, pcm)

    except Exception as e:
        # Best effort: fall back to the original file, but do not leave a
        # half-written temporary file behind.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        print(f"[Resample] Error resampling: {e}, returning original")
        return input_path

    # A failure to delete the source must not discard the resampled result.
    try:
        os.remove(input_path)
    except OSError as e:
        print(f"[Resample] Could not remove original {input_path}: {e}")

    print(f"[Resample] Successfully resampled to {output_path}")
    return output_path
106
+
107
+
108
def convert_mp3_to_wav_16khz(input_path: str) -> str:
    """
    Convert an MP3 file to a mono WAV at TARGET_SAMPLE_RATE.

    pydub is used when it can be imported; otherwise the ``ffmpeg``
    executable is invoked directly.

    Args:
        input_path: Path to the source MP3 file.

    Returns:
        Path to a new temporary WAV file on success (the original MP3 is
        then deleted on a best-effort basis). The unchanged ``input_path``
        is returned if the conversion fails for any reason.
    """
    try:
        from pydub import AudioSegment
        use_pydub = True
    except ImportError:
        use_pydub = False
        print("[Convert] pydub not available, trying ffmpeg directly")

    output_path = None
    try:
        # Reserve the destination path up front; it is removed again below
        # if the conversion does not complete.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            output_path = f.name

        if use_pydub:
            print(f"[Convert] Converting MP3 to WAV 16kHz: {input_path}")

            # Load MP3, convert to mono and set sample rate
            audio = AudioSegment.from_mp3(input_path)
            audio = audio.set_channels(1).set_frame_rate(TARGET_SAMPLE_RATE)

            # Export as WAV
            audio.export(output_path, format="wav")
        else:
            import subprocess

            # Argument list (no shell) so the path is never shell-interpreted.
            subprocess.run([
                "ffmpeg", "-y", "-i", input_path,
                "-ar", str(TARGET_SAMPLE_RATE),
                "-ac", "1",
                output_path
            ], check=True, capture_output=True)

    except Exception as e:
        # Best effort: fall back to the original file, but do not leave an
        # empty or partial temporary WAV behind.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        if use_pydub:
            print(f"[Convert] Error converting: {e}, returning original")
        else:
            print(f"[Convert] ffmpeg conversion failed: {e}, returning original")
        return input_path

    # A failure to delete the source must not discard the converted result.
    try:
        os.remove(input_path)
    except OSError as e:
        print(f"[Convert] Could not remove original {input_path}: {e}")

    print(f"[Convert] Successfully converted to {output_path}")
    return output_path
159
+
160
+
161
  model_mapping = {
162
  # 채널톡 TTS (한국어 특화)
163
  "channel-hana": {
 
390
  model: 모델 ID (model_mapping의 키)
391
 
392
  Returns:
393
+ 생성된 오디오 파일 경로 (16kHz WAV로 통일)
394
  """
395
  print(f"[TTS] Predicting for model: {model}")
396
 
 
399
 
400
  config = model_mapping[model]
401
  provider = config["provider"]
402
+ audio_path = None
403
+ is_mp3 = False
404
 
405
  if provider == "channel":
406
+ audio_path = predict_channel_tts(text, config.get("voice", "hana"))
407
+ # Channel TTS returns WAV at 24kHz
408
 
409
  elif provider == "openai":
410
+ audio_path = predict_openai_tts(
411
  text,
412
  config.get("model", "gpt-4o-mini-tts"),
413
  config.get("voice", "coral"),
414
  )
415
+ # OpenAI returns WAV
416
 
417
  elif provider == "google":
418
+ audio_path = predict_google_tts(text, config.get("voice", "ko-KR-Wavenet-A"))
419
+ # Google returns WAV at 24kHz
420
 
421
  elif provider == "elevenlabs":
422
+ audio_path = predict_elevenlabs_tts(text, config.get("model", "eleven_multilingual_v2"))
423
+ is_mp3 = True # ElevenLabs returns MP3
424
 
425
  elif provider == "supertone":
426
+ audio_path = predict_supertone_tts(text, config.get("model", "sona_speech_1"))
427
+ # Supertone returns WAV
428
 
429
  elif provider == "clova":
430
+ audio_path = predict_clova_tts(text, config.get("speaker", "nara"))
431
+ is_mp3 = True # CLOVA returns MP3
432
 
433
  else:
434
  raise ValueError(f"알 수 없는 provider: {provider}")
435
+
436
+ # Standardize to 16kHz WAV for fair comparison
437
+ if audio_path:
438
+ if is_mp3:
439
+ # Convert MP3 to WAV at 16kHz
440
+ audio_path = convert_mp3_to_wav_16khz(audio_path)
441
+ else:
442
+ # Resample WAV to 16kHz
443
+ audio_path = resample_wav_to_16khz(audio_path)
444
+
445
+ return audio_path
446
 
447
 
448
  if __name__ == "__main__":