Opera8 committed on
Commit
06310a1
·
verified ·
1 Parent(s): 05bb323

Update requirements.txt

Browse files
Files changed (1) hide show
  1. requirements.txt +31 -258
requirements.txt CHANGED
@@ -1,258 +1,31 @@
1
- import os
2
- import sys
3
- import importlib.util
4
- import site
5
- import json
6
- import torch
7
- import gradio as gr
8
- import torchaudio
9
- import numpy as np
10
- from huggingface_hub import snapshot_download, hf_hub_download
11
- import subprocess
12
- import re
13
- import spaces
14
- import uuid
15
- import soundfile as sf
16
-
17
- # منابع ضروری
18
- downloaded_resources = {
19
- "configs": False,
20
- "tokenizer_vq8192": False,
21
- "fmt_Vq8192ToMels": False,
22
- "vocoder": False
23
- }
24
-
25
- def install_espeak():
26
- try:
27
- result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
28
- if result.returncode != 0:
29
- print("Installing espeak-ng...")
30
- subprocess.run(["apt-get", "update"], check=True)
31
- subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
32
- except Exception as e:
33
- print(f"Error installing espeak-ng: {e}")
34
-
35
- install_espeak()
36
-
37
- def patch_langsegment_init():
38
- try:
39
- spec = importlib.util.find_spec("LangSegment")
40
- if spec is None or spec.origin is None: return
41
- init_path = os.path.join(os.path.dirname(spec.origin), '__init__.py')
42
- if not os.path.exists(init_path):
43
- for site_pkg_path in site.getsitepackages():
44
- potential_path = os.path.join(site_pkg_path, 'LangSegment', '__init__.py')
45
- if os.path.exists(potential_path):
46
- init_path = potential_path
47
- break
48
- else: return
49
-
50
- with open(init_path, 'r') as f: lines = f.readlines()
51
- modified = False
52
- new_lines = []
53
- target_line_prefix = "from .LangSegment import"
54
-
55
- for line in lines:
56
- if line.strip().startswith(target_line_prefix) and ('setLangfilters' in line or 'getLangfilters' in line):
57
- mod_line = line.replace(',setLangfilters', '').replace(',getLangfilters', '')
58
- mod_line = mod_line.replace('setLangfilters,', '').replace('getLangfilters,', '').rstrip(',')
59
- new_lines.append(mod_line + '\n')
60
- modified = True
61
- else:
62
- new_lines.append(line)
63
-
64
- if modified:
65
- with open(init_path, 'w') as f: f.writelines(new_lines)
66
- try:
67
- import LangSegment
68
- importlib.reload(LangSegment)
69
- except: pass
70
- except: pass
71
-
72
- patch_langsegment_init()
73
-
74
- if not os.path.exists("Amphion"):
75
- subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
76
- os.chdir("Amphion")
77
- else:
78
- if not os.getcwd().endswith("Amphion"):
79
- os.chdir("Amphion")
80
-
81
- if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
82
- sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
83
-
84
- os.makedirs("wav", exist_ok=True)
85
- os.makedirs("ckpts/Vevo", exist_ok=True)
86
-
87
- from models.vc.vevo.vevo_utils import VevoInferencePipeline
88
-
89
- # --- تابع ذخیره سازی دقیق (16-bit PCM) ---
90
- # این تابع کلید حل مشکل نویز صداست. فایل را دقیقاً مثل WAV استاندارد ذخیره می‌کند.
91
- def save_audio_pcm16(waveform, output_path, sample_rate=24000):
92
- try:
93
- if isinstance(waveform, torch.Tensor):
94
- waveform = waveform.detach().cpu()
95
- if waveform.dim() == 2 and waveform.shape[0] == 1:
96
- waveform = waveform.squeeze(0)
97
- waveform = waveform.numpy()
98
-
99
- # تبدیل به فرمت 16 بیتی برای جلوگیری از نویز
100
- # (مدل‌های Vevo با فرمت Float گاهی مشکل دارند)
101
- sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
102
-
103
- except Exception as e:
104
- print(f"Save error: {e}")
105
- raise e
106
-
107
- def setup_configs():
108
- if downloaded_resources["configs"]: return
109
- config_path = "models/vc/vevo/config"
110
- os.makedirs(config_path, exist_ok=True)
111
- config_files = ["Vq8192ToMels.json", "Vocoder.json"]
112
-
113
- for file in config_files:
114
- file_path = f"{config_path}/{file}"
115
- if not os.path.exists(file_path):
116
- try:
117
- file_data = hf_hub_download(repo_id="amphion/Vevo", filename=f"config/{file}", repo_type="model")
118
- subprocess.run(["cp", file_data, file_path])
119
- except Exception as e: print(f"Error downloading config {file}: {e}")
120
- downloaded_resources["configs"] = True
121
-
122
- setup_configs()
123
-
124
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
125
- print(f"Using device: {device}")
126
-
127
- inference_pipelines = {}
128
-
129
- def preload_all_resources():
130
- print("Preloading resources...")
131
- setup_configs()
132
-
133
- global downloaded_content_style_tokenizer_path
134
- global downloaded_fmt_path
135
- global downloaded_vocoder_path
136
-
137
- if not downloaded_resources["tokenizer_vq8192"]:
138
- local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
139
- downloaded_content_style_tokenizer_path = local_dir
140
- downloaded_resources["tokenizer_vq8192"] = True
141
-
142
- if not downloaded_resources["fmt_Vq8192ToMels"]:
143
- local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vq8192ToMels/*"])
144
- downloaded_fmt_path = local_dir
145
- downloaded_resources["fmt_Vq8192ToMels"] = True
146
-
147
- if not downloaded_resources["vocoder"]:
148
- local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
149
- downloaded_vocoder_path = local_dir
150
- downloaded_resources["vocoder"] = True
151
- print("Resources ready.")
152
-
153
- downloaded_content_style_tokenizer_path = None
154
- downloaded_fmt_path = None
155
- downloaded_vocoder_path = None
156
-
157
- preload_all_resources()
158
-
159
- def get_pipeline():
160
- if "timbre" in inference_pipelines:
161
- return inference_pipelines["timbre"]
162
-
163
- pipeline = VevoInferencePipeline(
164
- content_style_tokenizer_ckpt_path=os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192"),
165
- fmt_cfg_path="./models/vc/vevo/config/Vq8192ToMels.json",
166
- fmt_ckpt_path=os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"),
167
- vocoder_cfg_path="./models/vc/vevo/config/Vocoder.json",
168
- vocoder_ckpt_path=os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder"),
169
- device=device,
170
- )
171
-
172
- inference_pipelines["timbre"] = pipeline
173
- return pipeline
174
-
175
- @spaces.GPU()
176
- def vevo_timbre(content_wav, reference_wav):
177
- session_id = str(uuid.uuid4())[:8]
178
- temp_content_path = f"wav/c_{session_id}.wav"
179
- temp_reference_path = f"wav/r_{session_id}.wav"
180
- output_path = f"wav/out_{session_id}.wav"
181
-
182
- if content_wav is None or reference_wav is None:
183
- raise ValueError("Please upload audio files")
184
-
185
- try:
186
- # --- پردازش صدای اصلی ---
187
- if isinstance(content_wav, tuple):
188
- content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
189
- else:
190
- content_sr, content_data = content_wav
191
-
192
- if len(content_data.shape) > 1 and content_data.shape[1] > 1:
193
- content_data = np.mean(content_data, axis=1)
194
-
195
- content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
196
-
197
- if content_sr != 24000:
198
- content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
199
- content_sr = 24000
200
-
201
- content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
202
-
203
- # --- پردازش صدای رفرنس ---
204
- if isinstance(reference_wav, tuple):
205
- ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
206
- else:
207
- ref_sr, ref_data = reference_wav
208
-
209
- if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
210
- ref_data = np.mean(ref_data, axis=1)
211
-
212
- ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
213
- if ref_sr != 24000:
214
- ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
215
- ref_sr = 24000
216
-
217
- ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
218
-
219
- # *** ذخیره با فرمت PCM_16 (کلید حل مشکل نویز) ***
220
- save_audio_pcm16(content_tensor, temp_content_path, content_sr)
221
- save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
222
-
223
- print(f"[{session_id}] Processing...")
224
-
225
- pipeline = get_pipeline()
226
-
227
- # اجرای مدل
228
- gen_audio = pipeline.inference_fm(
229
- src_wav_path=temp_content_path,
230
- timbre_ref_wav_path=temp_reference_path,
231
- flow_matching_steps=32,
232
- )
233
-
234
- if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
235
- gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
236
-
237
- # ذخیره خروجی نهایی
238
- save_audio_pcm16(gen_audio, output_path, 24000)
239
- return output_path
240
-
241
- finally:
242
- if os.path.exists(temp_content_path): os.remove(temp_content_path)
243
- if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
244
-
245
- with gr.Blocks(title="Vevo-Timbre (High Quality)") as demo:
246
- gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
247
-
248
- with gr.Row():
249
- with gr.Column():
250
- timbre_content = gr.Audio(label="Source Audio", type="numpy")
251
- timbre_reference = gr.Audio(label="Target Timbre", type="numpy")
252
- timbre_button = gr.Button("Generate", variant="primary")
253
- with gr.Column():
254
- timbre_output = gr.Audio(label="Result")
255
-
256
- timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
257
-
258
- demo.launch()
 
1
+ gradio>=3.50.2
2
+ torch
3
+ torchaudio
4
+ numpy<2.0.0
5
+ huggingface_hub>=0.14.1
6
+ librosa>=0.9.2
7
+ PyYAML>=6.0
8
+ accelerate>=0.20.3
9
+ safetensors>=0.3.1
10
+ phonemizer>=3.2.0
11
+ setuptools
12
+ onnxruntime
13
+ transformers==4.41.2
14
+ unidecode
15
+ scipy>=1.12.0
16
+ encodec
17
+ g2p_en
18
+ jieba
19
+ cn2an
20
+ pypinyin
21
+ langsegment==0.2.0
22
+ pyopenjtalk
23
+ pykakasi
24
+ json5
25
+ black>=24.1.1
26
+ ruamel.yaml
27
+ tqdm
28
+ openai-whisper
29
+ ipython
30
+ pyworld
31
+ soundfile