Pj12 committed (verified)
Commit fbe4863 · 1 Parent(s): 85316a3

Upload extract_f0_print.py

Files changed (1)
extract_f0_print.py +302 -0
extract_f0_print.py ADDED
@@ -0,0 +1,302 @@
import os
import traceback
import sys
import parselmouth

now_dir = os.getcwd()
sys.path.append(now_dir)
from LazyImport import lazyload
from my_utils import load_audio
import pyworld
import numpy as np, logging

torchcrepe = lazyload("torchcrepe")  # Fork Feature. Crepe algo for training and preprocess
torch = lazyload("torch")
# from torch import Tensor  # Fork Feature. Used for pitch prediction for torch crepe.
tqdm = lazyload("tqdm")

logging.getLogger("numba").setLevel(logging.WARNING)

import multiprocessing

exp_dir = sys.argv[1]
f = open(f"{exp_dir}/extract_f0_feature.log", "a+")

DoFormant = False
Quefrency = 1.0
Timbre = 1.0


def printt(strr):
    print(strr)
    f.write(f"{strr}\n")
    f.flush()


n_p = int(sys.argv[2])
f0method = sys.argv[3]
extraction_crepe_hop_length = 0
try:
    extraction_crepe_hop_length = int(sys.argv[4])
except (IndexError, ValueError):
    print("Temp Issue. echl is not being passed with argument!")
    extraction_crepe_hop_length = 128

# print("EXTRACTION CREPE HOP LENGTH: " + str(extraction_crepe_hop_length))
# print("EXTRACTION CREPE HOP LENGTH TYPE: " + str(type(extraction_crepe_hop_length)))
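
# Usage note (added comment, inferred from the argument handling above):
#   python extract_f0_print.py <exp_dir> <n_p> <f0_method> [crepe_hop_length]
# <exp_dir> is the experiment folder containing 1_16k_wavs, <n_p> is the number
# of worker processes, <f0_method> selects the pitch extractor (pm, harvest, dio,
# rmvpe, rmvpe_onnx, crepe, mangio-crepe, or a hybrid[...] combination), and the
# optional hop length (default 128) only affects the crepe-based methods.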


class FeatureInput(object):
    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_method_dict = self.get_f0_method_dict()

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    # EXPERIMENTAL. PROBABLY BUGGY
    def mncrepe(self, method, x, p_len, crepe_hop_length):
        f0 = None
        torch_device_index = 0
        torch_device = (
            torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
            if torch.cuda.is_available()
            else torch.device("mps")
            if torch.backends.mps.is_available()
            else torch.device("cpu")
        )

        audio = torch.from_numpy(x.astype(np.float32)).to(torch_device, copy=True)
        audio /= torch.quantile(torch.abs(audio), 0.999)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()

        if method == "mangio-crepe":
            pitch: torch.Tensor = torchcrepe.predict(
                audio,
                self.fs,
                crepe_hop_length,
                self.f0_min,
                self.f0_max,
                "full",
                batch_size=crepe_hop_length * 2,
                device=torch_device,
                pad=True,
            )
            p_len = p_len or x.shape[0] // crepe_hop_length
            # Resize the pitch curve to the expected frame count
            source = np.array(pitch.squeeze(0).cpu().float().numpy())
            source[source < 0.001] = np.nan
            target = np.interp(
                np.arange(0, len(source) * p_len, len(source)) / p_len,
                np.arange(0, len(source)),
                source,
            )
            f0 = np.nan_to_num(target)

        elif method == "crepe":
            batch_size = 512
            audio = torch.tensor(np.copy(x))[None].float()
            f0, pd = torchcrepe.predict(
                audio,
                self.fs,
                160,
                self.f0_min,
                self.f0_max,
                "full",
                batch_size=batch_size,
                device=torch_device,
                return_periodicity=True,
            )
            pd = torchcrepe.filter.median(pd, 3)
            f0 = torchcrepe.filter.mean(f0, 3)
            f0[pd < 0.1] = 0
            f0 = f0[0].cpu().numpy()
            f0 = f0[1:]  # Get rid of extra first frame

        return f0

    def get_pm(self, x, p_len):
        f0 = parselmouth.Sound(x, self.fs).to_pitch_ac(
            time_step=160 / 16000,
            voicing_threshold=0.6,
            pitch_floor=self.f0_min,
            pitch_ceiling=self.f0_max,
        ).selected_array["frequency"]

        return np.pad(
            f0,
            [[
                max(0, (p_len - len(f0) + 1) // 2),
                max(0, p_len - len(f0) - (p_len - len(f0) + 1) // 2),
            ]],
            mode="constant",
        )

    def get_harvest(self, x):
        f0_spectral = pyworld.harvest(
            x.astype(np.double),
            fs=self.fs,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.hop / self.fs,
        )
        return pyworld.stonemask(x.astype(np.double), *f0_spectral, self.fs)

    def get_dio(self, x):
        f0_spectral = pyworld.dio(
            x.astype(np.double),
            fs=self.fs,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.hop / self.fs,
        )
        return pyworld.stonemask(x.astype(np.double), *f0_spectral, self.fs)

    def get_rmvpe(self, x):
        if not hasattr(self, "model_rmvpe"):
            from rmvpe import RMVPE

            self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cuda:0")

        return self.model_rmvpe.infer_from_audio(x, thred=0.03)

    def get_rmvpe_onnx(self, x):
        if not hasattr(self, "model_rmvpe_onnx"):
            from rmvpe import RMVPE

            self.model_rmvpe_onnx = RMVPE("rmvpe.onnx", is_half=False, device="cuda:0")

        return self.model_rmvpe_onnx.infer_from_audio(x, thred=0.03)

    def get_f0_method_dict(self):
        return {
            "pm": self.get_pm,
            "harvest": self.get_harvest,
            "dio": self.get_dio,
            "rmvpe_onnx": self.get_rmvpe_onnx,
            "rmvpe": self.get_rmvpe,
        }

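    # Note on hybrid mode (added comment): the method string is expected to look
    # like "hybrid[pm+harvest]"; the parser below strips the "hybrid" prefix and
    # the brackets, splits on "+", and takes the nan-median of the resulting f0
    # curves.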
    def get_f0_hybrid_computation(
        self,
        methods_str,
        x,
        p_len,
        crepe_hop_length,
    ):
        # Get the f0 methods to use in the computation stack from the input string
        s = methods_str
        s = s.split("hybrid")[1]
        s = s.replace("[", "").replace("]", "")
        methods = s.split("+")
        f0_computation_stack = []

        for method in methods:
            if method in self.f0_method_dict:
                f0 = (
                    self.f0_method_dict[method](x, p_len)
                    if method == "pm"
                    else self.f0_method_dict[method](x)
                )
                f0_computation_stack.append(f0)
            elif method in ("crepe", "mangio-crepe"):
                # crepe-based methods go through mncrepe(), which needs the hop length
                f0_computation_stack.append(
                    self.mncrepe(method, x, p_len, crepe_hop_length)
                )

        if len(f0_computation_stack) != 0:
            f0_median_hybrid = (
                np.nanmedian(f0_computation_stack, axis=0)
                if len(f0_computation_stack) > 1
                else f0_computation_stack[0]
            )
            return f0_median_hybrid
        else:
            raise ValueError("No valid methods were provided")

    def compute_f0(self, path, f0_method, crepe_hop_length):
        x = load_audio(path, self.fs, DoFormant, Quefrency, Timbre)
        p_len = x.shape[0] // self.hop

        if f0_method in self.f0_method_dict:
            f0 = (
                self.f0_method_dict[f0_method](x, p_len)
                if f0_method == "pm"
                else self.f0_method_dict[f0_method](x)
            )
        elif f0_method in ["crepe", "mangio-crepe"]:
            f0 = self.mncrepe(f0_method, x, p_len, crepe_hop_length)
        elif "hybrid" in f0_method:  # EXPERIMENTAL
            # Perform hybrid median pitch estimation
            # Mangio-RVC-Fork Feature: Add hybrid f0 inference to feature extraction. EXPERIMENTAL...
            f0 = self.get_f0_hybrid_computation(
                f0_method,
                x,
                p_len,
                crepe_hop_length,
            )

        return f0

    def coarse_f0(self, f0):
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

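    # Worked example for coarse_f0 (added comment, values approximate): with
    # f0_min=50 Hz and f0_max=1100 Hz, f0_mel spans roughly 77.8..1064.4, so a
    # 440 Hz frame has f0_mel ~= 549.7 and lands in coarse bin ~122 of the 1..255
    # range, while unvoiced frames (f0 = 0) stay in bin 1.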
    def go(self, paths, f0_method, crepe_hop_length, thread_n):
        if not paths:
            printt("no-f0-todo")
            return

        with tqdm.tqdm(total=len(paths), leave=True, position=thread_n) as pbar:
            description = f"thread:{thread_n}, f0ing, Hop-Length:{crepe_hop_length}"
            pbar.set_description(description)

            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
                        pbar.update(1)
                        continue

                    featur_pit = self.compute_f0(inp_path, f0_method, crepe_hop_length)
                    np.save(opt_path2, featur_pit, allow_pickle=False)  # nsf

                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(opt_path1, coarse_pit, allow_pickle=False)  # ori

                    pbar.update(1)
                except Exception:
                    printt(f"f0fail-{idx}-{inp_path}-{traceback.format_exc()}")


if __name__ == "__main__":
    # exp_dir = r"E:\codes\py39\dataset\mi-test"
    # n_p = 16
    # f = open("%s/log_extract_f0.log" % exp_dir, "w")
    printt(sys.argv)
    featureInput = FeatureInput()
    paths = []
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])

    ps = []
    print("Using f0 method: " + f0method)
    for i in range(n_p):
        p = multiprocessing.Process(
            target=featureInput.go,
            args=(paths[i::n_p], f0method, extraction_crepe_hop_length, i),
        )
        ps.append(p)
        p.start()
    for i in range(n_p):
        ps[i].join()
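
# Output note (added comment): for each wav in <exp_dir>/1_16k_wavs this script
# writes two .npy files, the raw f0 curve in Hz under <exp_dir>/2b-f0nsf and the
# 1..255 coarse bins under <exp_dir>/2a_f0, skipping inputs whose outputs already
# exist.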