import argparse
import json
import shutil
from pathlib import Path

import librosa
import soundfile as sf

from preprocess.utils import convert_metadata, merge_short_segments
from preprocess.tools import (
    F0Extractor,
    VocalDetector,
    VocalSeparator,
    NoteTranscriber,
    LyricTranscriber,
)


class PreprocessPipeline:
    """End-to-end singing-voice preprocessing pipeline.

    Steps: (optional) vocal separation -> F0 extraction -> vocal activity
    segmentation -> per-segment F0 / lyric / note transcription -> merging of
    short segments -> metadata export as JSON.
    """

    def __init__(
        self,
        device: str,
        language: str,
        save_dir: str,
        vocal_sep: bool = True,
        max_merge_duration: int = 60000,
    ):
        """Load all models required by the pipeline.

        Args:
            device: Torch device string the models run on (e.g. "cuda:0").
            language: Default transcription language (e.g. "Mandarin").
            save_dir: Directory where all intermediate and final files go.
            vocal_sep: Default for whether vocal/accompaniment separation is
                performed; when False the separation models are not loaded.
            max_merge_duration: Default maximum merged-segment duration (ms).
        """
        self.device = device
        self.language = language
        self.save_dir = save_dir
        self.vocal_sep = vocal_sep
        self.max_merge_duration = max_merge_duration

        # Separation models are large; only load them when separation is on.
        if vocal_sep:
            self.vocal_separator = VocalSeparator(
                sep_model_path="pretrained_models/SoulX-Singer-Preprocess/mel-band-roformer-karaoke/mel_band_roformer_karaoke_becruily.ckpt",
                sep_config_path="pretrained_models/SoulX-Singer-Preprocess/mel-band-roformer-karaoke/config_karaoke_becruily.yaml",
                der_model_path="pretrained_models/SoulX-Singer-Preprocess/dereverb_mel_band_roformer/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt",
                der_config_path="pretrained_models/SoulX-Singer-Preprocess/dereverb_mel_band_roformer/dereverb_mel_band_roformer_anvuew.yaml",
                device=device,
            )
        else:
            self.vocal_separator = None

        self.f0_extractor = F0Extractor(
            model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
            device=device,
        )
        self.vocal_detector = VocalDetector(
            cut_wavs_output_dir=f"{save_dir}/cut_wavs",
        )
        self.lyric_transcriber = LyricTranscriber(
            zh_model_path="pretrained_models/SoulX-Singer-Preprocess/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
            en_model_path="pretrained_models/SoulX-Singer-Preprocess/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
            device=device,
        )
        self.note_transcriber = NoteTranscriber(
            rosvot_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rosvot/model.pt",
            rwbd_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rwbd/model.pt",
            device=device,
        )

    def run(
        self,
        audio_path: str,
        vocal_sep: bool | None = None,
        max_merge_duration: int | None = None,
        language: str | None = None,
    ) -> None:
        """Process one audio file and write segment WAVs + metadata.json.

        Args:
            audio_path: Path to the input audio file.
            vocal_sep: Override the instance setting; None keeps it.
            max_merge_duration: Override the instance setting (ms); None keeps it.
            language: Override the instance language; None keeps it.

        Side effects: writes vocal.wav (and acc.wav when separating), cut
        segment WAVs, per-segment *_f0.npy files, metadata.json under
        ``save_dir``, plus a copy of metadata.json next to ``audio_path``.
        """
        # BUGFIX: the defaults used to be True/60000/"Mandarin", which made
        # these `is None` fallbacks dead code — the configuration passed to
        # __init__ was silently ignored.  With None defaults, an unspecified
        # argument now falls back to the instance-level setting as intended.
        vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
        max_merge_duration = (
            self.max_merge_duration if max_merge_duration is None else max_merge_duration
        )
        language = self.language if language is None else language

        output_dir = Path(self.save_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        if vocal_sep:
            # Perform vocal/accompaniment separation.
            sep = self.vocal_separator.process(audio_path)
            vocal = sep.vocals_dereverbed.T
            acc = sep.accompaniment.T
            sample_rate = sep.sample_rate
            vocal_path = output_dir / "vocal.wav"
            acc_path = output_dir / "acc.wav"
            sf.write(vocal_path, vocal, sample_rate)
            sf.write(acc_path, acc, sample_rate)
        else:
            # Use the original audio as the vocal source (no separation).
            vocal, sample_rate = librosa.load(audio_path, sr=None, mono=True)
            vocal_path = output_dir / "vocal.wav"
            sf.write(vocal_path, vocal, sample_rate)

        # Full-track F0 guides the vocal activity detector's segmentation.
        vocal_f0 = self.f0_extractor.process(str(vocal_path))
        segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)

        metadata = []
        for seg in segments:
            # Per-segment F0 is written alongside each cut WAV.
            self.f0_extractor.process(
                seg["wav_fn"], f0_path=seg["wav_fn"].replace(".wav", "_f0.npy")
            )
            words, durs = self.lyric_transcriber.process(seg["wav_fn"], language)
            seg["words"] = words
            seg["word_durs"] = durs
            seg["language"] = language
            metadata.append(self.note_transcriber.process(seg, segment_info=seg))

        merged = merge_short_segments(
            vocal,
            sample_rate,
            metadata,
            output_dir / "long_cut_wavs",
            max_duration_ms=max_merge_duration,
        )

        final_metadata = []
        for item in merged:
            # Re-extract F0 for the merged (longer) WAVs.
            self.f0_extractor.process(
                item.wav_fn, f0_path=item.wav_fn.replace(".wav", "_f0.npy")
            )
            final_metadata.append(convert_metadata(item))

        metadata_path = output_dir / "metadata.json"
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(final_metadata, f, ensure_ascii=False, indent=2)

        # Drop a copy of the metadata next to the source audio.  BUGFIX: the
        # old chained str.replace(".wav"/".mp3"/".flac") left any other
        # extension untouched (copying JSON over the audio path) and could
        # match mid-path; with_suffix handles every extension correctly.
        shutil.copy(metadata_path, str(Path(audio_path).with_suffix(".json")))


def _str2bool(value: str) -> bool:
    """Parse a command-line boolean.

    BUGFIX for ``type=bool``: argparse passes the raw string to the type
    callable, and ``bool("False")`` is True — so ``--vocal_sep False`` used
    to enable separation.
    """
    lowered = value.strip().lower()
    if lowered in ("true", "1", "yes", "y"):
        return True
    if lowered in ("false", "0", "no", "n"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main(args) -> None:
    """Build a PreprocessPipeline from parsed CLI arguments and run it."""
    pipeline = PreprocessPipeline(
        device=args.device,
        language=args.language,
        save_dir=args.save_dir,
        vocal_sep=args.vocal_sep,
        max_merge_duration=args.max_merge_duration,
    )
    # run() falls back to the instance-level language/vocal_sep/duration
    # configured above, so only the audio path needs to be passed.
    pipeline.run(audio_path=args.audio_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--audio_path", type=str, required=True, help="Path to the input audio file")
    parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
    parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
    parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
    parser.add_argument("--vocal_sep", type=_str2bool, default=True, help="Whether to perform vocal separation")
    parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
    args = parser.parse_args()
    main(args)