Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
4620bde
1
Parent(s):
3f373d0
add volume-normalization to avoid audio clipping after multiple edits
Browse files
tts.py
CHANGED
|
@@ -367,6 +367,12 @@ class StepAudioTTS:
|
|
| 367 |
prompt_wav, prompt_wav_sr = torchaudio.load(prompt_wav_path)
|
| 368 |
if prompt_wav.shape[0] > 1:
|
| 369 |
prompt_wav = prompt_wav.mean(dim=0, keepdim=True) # 将多通道音频转换为单通道
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
speech_feat, speech_feat_len = self.cosy_model.frontend.extract_speech_feat(
|
| 371 |
prompt_wav, prompt_wav_sr
|
| 372 |
)
|
|
|
|
| 367 |
prompt_wav, prompt_wav_sr = torchaudio.load(prompt_wav_path)
|
| 368 |
if prompt_wav.shape[0] > 1:
|
| 369 |
prompt_wav = prompt_wav.mean(dim=0, keepdim=True) # 将多通道音频转换为单通道
|
| 370 |
+
|
| 371 |
+
# volume-normalize avoid clipping
|
| 372 |
+
norm = torch.max(torch.abs(prompt_wav), dim=1, keepdim=True)[0]
|
| 373 |
+
if norm > 0.6: # hard code; max absolute value is 0.6
|
| 374 |
+
prompt_wav = prompt_wav / norm * 0.6
|
| 375 |
+
|
| 376 |
speech_feat, speech_feat_len = self.cosy_model.frontend.extract_speech_feat(
|
| 377 |
prompt_wav, prompt_wav_sr
|
| 378 |
)
|