Commit efacc59 · committed by tianfengping.tfp
Parent(s): 26dff53

modify emotion type to english

This view is limited to 50 files because it contains too many changes.
- app.py +195 -255
- cosyvoice_rodis/__init__.py +2 -0
- cosyvoice_rodis/__pycache__/__init__.cpython-310.pyc +0 -0
- cosyvoice_rodis/__pycache__/__init__.cpython-312.pyc +0 -0
- cosyvoice_rodis/__pycache__/__init__.cpython-38.pyc +0 -0
- cosyvoice_rodis/__pycache__/__init__.cpython-39.pyc +0 -0
- cosyvoice_rodis/bin/average_model.py +91 -0
- cosyvoice_rodis/bin/export_jit.py +73 -0
- cosyvoice_rodis/bin/export_onnx.py +110 -0
- cosyvoice_rodis/bin/inference.py +114 -0
- cosyvoice_rodis/bin/train.py +159 -0
- cosyvoice_rodis/cli/__init__.py +2 -0
- cosyvoice_rodis/cli/__pycache__/__init__.cpython-310.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/__init__.cpython-312.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/__init__.cpython-38.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/__init__.cpython-39.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-310.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-312.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-38.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-39.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/frontend.cpython-310.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/frontend.cpython-38.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/frontend.cpython-39.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/model.cpython-310.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/model.cpython-38.pyc +0 -0
- cosyvoice_rodis/cli/__pycache__/model.cpython-39.pyc +0 -0
- cosyvoice_rodis/cli/cosyvoice.py +114 -0
- cosyvoice_rodis/cli/frontend.py +192 -0
- cosyvoice_rodis/cli/model.py +257 -0
- cosyvoice_rodis/dataset/__init__.py +2 -0
- cosyvoice_rodis/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
- cosyvoice_rodis/dataset/__pycache__/__init__.cpython-38.pyc +0 -0
- cosyvoice_rodis/dataset/__pycache__/__init__.cpython-39.pyc +0 -0
- cosyvoice_rodis/dataset/__pycache__/dataset.cpython-310.pyc +0 -0
- cosyvoice_rodis/dataset/__pycache__/dataset.cpython-38.pyc +0 -0
- cosyvoice_rodis/dataset/__pycache__/processor.cpython-310.pyc +0 -0
- cosyvoice_rodis/dataset/__pycache__/processor.cpython-38.pyc +0 -0
- cosyvoice_rodis/dataset/__pycache__/processor.cpython-39.pyc +0 -0
- cosyvoice_rodis/dataset/dataset.py +163 -0
- cosyvoice_rodis/dataset/processor.py +427 -0
- cosyvoice_rodis/flow/__pycache__/decoder.cpython-310.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/decoder.cpython-38.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/decoder.cpython-39.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/flow.cpython-310.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/flow.cpython-38.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/flow.cpython-39.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/flow_matching.cpython-310.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/flow_matching.cpython-38.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/flow_matching.cpython-39.pyc +0 -0
- cosyvoice_rodis/flow/__pycache__/length_regulator.cpython-310.pyc +0 -0
app.py CHANGED

@@ -1,13 +1,12 @@
 import gradio as gr
 import sys, os
+from huggingface_hub import snapshot_download, hf_hub_download
+
 import torch
 from cosyvoice.utils.file_utils import load_wav
-from tts_model.base_model.cosyvoice import CosyVoice as CosyVoiceTTS_base
-from tts_model.sft_model.cosyvoice import CosyVoice as CosyVoiceTTS_sft
 from uuid import uuid1
 import uuid
-from
-# from tts_model.model_cosy2_instruct import CosyVoiceTTS as CosyVoiceTTS_cosy2
+from cosyvoice_rodis.cli.cosyvoice import CosyVoice as CosyVoiceTTS_speakerminus
 from pydub import AudioSegment
 import tempfile
 import soundfile as sf
@@ -17,20 +16,56 @@ import random
 import numpy
 
 
-
-
-
+import imageio_ffmpeg
+
+ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
+print(f"FFmpeg path: {ffmpeg_path}")
+user_bin = os.path.expanduser("~/bin")
+if not os.path.exists(user_bin):
+    os.makedirs(user_bin)
+ffmpeg_link = os.path.join(user_bin, "ffmpeg")
+if os.path.exists(ffmpeg_link):
+    os.remove(ffmpeg_link)
+os.symlink(ffmpeg_path, ffmpeg_link)
+print(f"create symbolic link: {ffmpeg_link}")
+os.environ["PATH"] = f"{user_bin}:{os.environ.get('PATH', '')}"
 
-ffmpeg_path = os.path.expanduser("/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffmpeg/")
-os.environ["PATH"] += os.pathsep + ffmpeg_path
 
 sys.path.append('third_party/Matcha-TTS')
 os.system('export PYTHONPATH=third_party/Matcha-TTS')
 
-
-
-
-
+assets_dir = snapshot_download(
+    repo_id="tienfeng/prompt",
+    repo_type="dataset",
+)
+
+from huggingface_hub import hf_hub_download
+
+model_repo_id = "AIDC-AI/Marco-Voice"
+local_model = snapshot_download(
+    repo_id=model_repo_id,
+    repo_type="model"
+    # token=os.getenv("HF_TOKEN")
+)
+
+local_model_path = os.path.join(local_model, "marco_voice")
+local_model_path_enhenced = os.path.join(local_model, "marco_voice_enhenced")
+
+
+logo_path = hf_hub_download(
+    repo_id="tienfeng/prompt",
+    filename="logo2.png",
+    repo_type="dataset",
+)
+
+logo_path2 = hf_hub_download(
+    repo_id="tienfeng/prompt",
+    filename="logo.png",
+    repo_type="dataset",
+)
+
+tts_speakerminus = CosyVoiceTTS_speakerminus(model_dir=local_model_path)
+tts_sft = CosyVoiceTTS_speakerminus(model_dir=local_model_path_enhenced)
 
 text_prompt = {
     "翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
@@ -58,7 +93,7 @@ audio_prompt = {
     "赵晓卉": "zhaoxiaohui",
     "徐志胜": "xuzhisheng"
 }
-audio_prompt_path =
+audio_prompt_path = assets_dir
 
 def load_audio_and_convert_to_16bit(file_path, target_sample_rate=16000):
     audio = AudioSegment.from_file(file_path)
@@ -88,11 +123,11 @@ def convert_audio_with_sox(input_file, output_file, target_sample_rate=16000):
     # ]
     command = [
         './ffmpeg-7.0.2-amd64-static/ffmpeg',
-        '-i', input_file,
-        '-ar', str(target_sample_rate),
-        '-ac', '1',
-        '-b:a', '16k',
-        '-f', 'wav',
+        '-i', input_file,
+        '-ar', str(target_sample_rate),
+        '-ac', '1',
+        '-b:a', '16k',
+        '-f', 'wav',
         output_file
     ]
 
@@ -103,88 +138,45 @@ def convert_audio_with_sox(input_file, output_file, target_sample_rate=16000):
 
 os.makedirs("./tmp", exist_ok=True)
 
-def
-    # if not ref_audio and not ref_text:
-    #     ref_text = text_prompt.get(speaker, "")
-    #     ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
-    # else:
-    #     random_int = random.randint(0, 90)
-    #     soxsed_ref_audio = "/tmp/{random_int}_ref.wav"
-    #     convert_audio_with_sox(ref_audio, soxsed_ref_audio)
-    #     ref_audio = load_wav(ref_audio, 16000)
-    #     # ref_audio, target_sample_rate = load_audio_and_convert_to_16bit(ref_audio)
-
-    sample_rate, full_audio = tts_sft.inference_sft(
-        tts_text,
-        spk_id = speaker
-        # instruct = instruct
-        # prompt_text = ref_text,
-        # prompt_speech_16k = ref_audio,
-        # speed=speed,
-        # speaker=speaker,
-        # emotion=emotion,
-
-    )
-    full_audio = full_audio.astype(np.float32)
-    if full_audio.max() > 1.0 or full_audio.min() < -1.0:
-        full_audio /= 32768.0  # int16 → [-1,1]
-
-    print("dtype:", full_audio.dtype,
-          "shape:", full_audio.shape,
-          "max:", full_audio.max(), "min:", full_audio.min())
-
-    out_path = os.path.join("./tmp", f"{uuid.uuid4().hex}.wav")
-
-    audio_segment = AudioSegment(
-        full_audio.tobytes(),
-        frame_rate=sample_rate,
-        sample_width=full_audio.dtype.itemsize,
-        channels=1
-    )
-    audio_segment.export(out_path, format="wav")
-
-    print(">>> audio path:", os.path.abspath(out_path))
-    # return out_path
-    return (sample_rate, full_audio)
-    # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
-    #     output_audio_path = temp_audio_file.name
-    #     audio_segment = AudioSegment(
-    #         full_audio.tobytes(),
-    #         frame_rate=sample_rate,
-    #         sample_width=full_audio.dtype.itemsize,
-    #         channels=1
-    #     )
-    #     audio_segment.export(output_audio_path, format="wav")
-    #     print(f"Audio saved to {output_audio_path}")
-
-    # return output_audio_path
-
-# def generate_speech_sft(tts_text, speaker):
-#     sr = 22050  # sample rate
-#     t = np.linspace(0, 1, sr, dtype=np.float32)
-#     audio_np = np.sin(2 * np.pi * 440 * t)  # 1-second 440 Hz sine wave
-#     return (sr, audio_np)
-
-def generate_speech_base(tts_text, speed, speaker, ref_audio, ref_text):
+def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
     if not ref_audio and not ref_text:
         ref_text = text_prompt.get(speaker, "")
-
-
+        speaker_audio_name = audio_prompt.get(speaker)
+        if speaker_audio_name:
+            ref_audio = os.path.join(audio_prompt_path, f"{speaker_audio_name}.wav")
+        else:
+            raise ValueError(f"Speaker '{speaker}' not found in audio_prompt dictionary")
     else:
-        random_int = random.randint(0,
-        soxsed_ref_audio = f"
+        random_int = random.randint(0, 90)
+        soxsed_ref_audio = f"./tmp/{random_int}_ref.wav"
         convert_audio_with_sox(ref_audio, soxsed_ref_audio)
-        ref_audio =
-
-
+        ref_audio = soxsed_ref_audio
+
+    if not ref_audio:
+        raise ValueError("Reference audio is required but not provided")
+    ref_audio = load_wav(ref_audio, 16000)
+    emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
+    # key="快乐"
+    if key in ["Angry", "Surprise", "Happy"]:
+        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+    elif key in ["Sad"]:
+        emotion_info = torch.load("./emotion_info.pt")["female005"][key]
+    elif key in ["Fearful"]:
+        emotion_info = torch.load("./emotion_info.pt")["female003"][key]
+    else:
+        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+
+    sample_rate, full_audio = tts_sft.inference_zero_shot(
         tts_text,
         prompt_text = ref_text,
-        prompt_speech_16k = ref_audio,
-        speed=speed,
         # speaker=speaker,
-
-
+        prompt_speech_16k = ref_audio,
+        key = emo.get(key),
+        emotion_speakerminus=emotion_info,
+        # ref_audio = ref_audio,
+        speed=speed
+
     )
     print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
@@ -197,37 +189,45 @@ def generate_speech_base(tts_text, speed, speaker, ref_audio, ref_text):
     )
     audio_segment.export(output_audio_path, format="wav")
     print(f"Audio saved to {output_audio_path}")
-
     return output_audio_path
 
-
+
+def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
     if not ref_audio and not ref_text:
         ref_text = text_prompt.get(speaker, "")
-
+        speaker_audio_name = audio_prompt.get(speaker)
+        if speaker_audio_name:
+            ref_audio = os.path.join(audio_prompt_path, f"{speaker_audio_name}.wav")
+        else:
+            raise ValueError(f"Speaker '{speaker}' not found in audio_prompt dictionary")
     else:
         random_int = random.randint(0, 90)
-        soxsed_ref_audio = f"
+        soxsed_ref_audio = f"./tmp/{random_int}_ref.wav"
         convert_audio_with_sox(ref_audio, soxsed_ref_audio)
-
-
+        ref_audio = soxsed_ref_audio
+
+    if not ref_audio:
+        raise ValueError("Reference audio is required but not provided")
     ref_audio = load_wav(ref_audio, 16000)
-
-
-    #
-
-
-
-
-
-
-
+
+    emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
+    # key="快乐"
+    if key in ["Angry", "Surprise", "Happy"]:
+        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+    elif key in ["Sad"]:
+        emotion_info = torch.load("./emotion_info.pt")["female005"][key]
+    elif key in ["Fearful"]:
+        emotion_info = torch.load("./emotion_info.pt")["female003"][key]
+    else:
+        emotion_info = torch.load("./emotion_info.pt")["male005"][key]
+
+    sample_rate, full_audio = tts_sft.inference_zero_shot(
         tts_text,
         prompt_text = ref_text,
         # speaker=speaker,
         prompt_speech_16k = ref_audio,
-        key = key,
+        key = emo.get(key),
         emotion_speakerminus=emotion_info,
         # ref_audio = ref_audio,
         speed=speed
@@ -328,7 +328,7 @@ body {
     flex-grow: 1; /* take up all remaining space */
 }
 
-/* 6.
+/* 6. title */
 #header-title h1 {
     color: white;
     font-size: 28px;
@@ -561,38 +561,29 @@ input[type="text"]:focus, textarea:focus {
 }
 """
 
-# create the interface
-logo_path = "/mnt/by079416/fengping/CosyVoice2/logo2.png"
 
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     with gr.Column(elem_classes="header"):
-        # row container for left/right layout
         with gr.Row(elem_id="header-row", variant="compact"):
-            # left: logo
             gr.Image(value=logo_path,
                      elem_id="logo-container",
                      show_label=False,
                      show_download_button=False,
-                     show_share_button=False)
+                     show_share_button=False)
 
-            # right: title area
             with gr.Column(elem_id="title-area"):
-                gr.Markdown("# 🎤 Marco-Voice
+                gr.Markdown("# 🎤 Marco-Voice ", elem_id="header-title")
 
-    # gr.Markdown("")
-
-    # tabs
     with gr.Tabs(elem_classes="tabs") as tabs:
-
-        with gr.TabItem("🎭 音色克隆", id=0):
+        with gr.TabItem("😄 Control of emotion", id=0):
             with gr.Row():
                 with gr.Column(scale=2, elem_classes="input-section"):
-                    gr.Markdown("###
+                    gr.Markdown("### Input Settings")
                     tts_text_v1 = gr.Textbox(
                         lines=3,
-                        placeholder="
-                        label="
-                        value="
+                        placeholder="Enter the text content you want to compose...",
+                        label="Synthesizing text",
+                        value="这真是太令人兴奋了!我们刚刚完成了一个重大突破!"
                     )
 
                     with gr.Row():
@@ -602,174 +593,130 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
                                 maximum=2.0,
                                 value=1.0,
                                 step=0.1,
-                                label="
-
+                                label="Speaking rate control"
+                            )
+                        with gr.Column():
+                            emotion_v1 = gr.Radio(
+                                choices=["Angry", "Happy", "Surprise", "Sad", "Fearful", "Jolliest"],
+                                value="Happy",
+                                label="Emotion selection"
                             )
+
+                    with gr.Row():
                         with gr.Column():
                             speaker_v1 = gr.Dropdown(
                                 choices=names,
                                 value="徐志胜",
-                                label="
-                                info="选择脱口秀演员音色"
-                            )
-                    # [tts_text_v1, speed_v1, speaker_v1, emotion, ref_audio_v1, ref_text_v1]
-                    with gr.Accordion("高级设置", open=False, elem_classes="accordion"):
-                        gr.Markdown("上传3-10秒清晰人声作为参考音频")
-                        with gr.Row():
-                            ref_audio_v1 = gr.Audio(
-                                type="filepath",
-                                label="上传参考音频",
-                                elem_classes="audio-upload"
-                            )
-                            ref_text_v1 = gr.Textbox(
-                                lines=2,
-                                placeholder="参考音频对应的文本...",
-                                label="参考文本"
+                                label="Preset timbre"
                             )
+                        with gr.Column():
+                            gr.Markdown("### Or use a custom timbre")
+                            with gr.Accordion("Upload reference audio", open=False, elem_classes="accordion"):
+                                gr.Markdown("Upload 3-10 seconds of clear human voice as reference audio")
+                                ref_audio_v1 = gr.Audio(
+                                    type="filepath",
+                                    label="upload audio",
+                                    elem_classes="audio-upload"
+                                )
+                                ref_text_v1 = gr.Textbox(
+                                    lines=2,
+                                    placeholder="ref text content...",
+                                    label="ref text"
+                                )
 
                     gr.Markdown("""
                     <div class="model-info">
-                    <p><span class="info-icon">ℹ️</span> <strong
+                    <p><span class="info-icon">ℹ️</span> <strong>specification of a model:</strong> This model added emotion control ability on the basis of timbre cloning, and could generate speech with specific emotion.</p>
+                    <p><span class="info-icon">💡</span> <strong>use skill:</strong> The sentiment expression effect is related to the content of the text, make sure the text matches the selected sentiment.</p>
                     </div>
                     """)
 
                 with gr.Column(scale=1, elem_classes="output-section"):
-                    gr.Markdown("###
-
+                    gr.Markdown("### output result")
+                    tts_v1_output = gr.Audio(
                         type="filepath",
-                        label="
-                        elem_id="tts_output_audio",
+                        label="Generating speech",
                         interactive=False
                     )
-
-                        "🚀
+                    tts_v1_button = gr.Button(
+                        "🚀 Generating speech",
                         variant="primary",
                         elem_classes="btn-generate"
                     )
                     gr.Examples(
                         examples=[
-                            ["
-                            ["
-                            ["
-                        ],
-                        inputs=[tts_text_v1, speaker_v1],
-                        label="示例文本"
-                    )
-
-        # Tab 2: multilingual synthesis
-        with gr.TabItem("🌍 多语种合成", id=1):
-            with gr.Row():
-                with gr.Column(scale=2, elem_classes="input-section"):
-                    gr.Markdown("### 输入设置")
-                    tts_text_sft = gr.Textbox(
-                        lines=3,
-                        placeholder="请输入要合成的文本内容...",
-                        label="合成文本",
-                        value="Hello, welcome to Marco-Voice text-to-speech system. This is a powerful multilingual TTS tool."
-                    )
-
-                    speaker_sft = gr.Dropdown(
-                        choices=["中文男", "中文女", "英文男", "英文女", "韩语女", "日语男"],
-                        value="英文男",
-                        label="说话人",
-                        info="选择语言和性别"
-                    )
-
-                    gr.Markdown("""
-                    <div class="model-info">
-                    <p><span class="info-icon">ℹ️</span> <strong>模型说明:</strong> 此模型支持多个语种,无需参考音频即可生成自然语音。</p>
-                    <p><span class="info-icon">💡</span> <strong>使用技巧:</strong> 输入文本语言应与选择的说话人语言一致以获得最佳效果。</p>
-                    </div>
-                    """)
-
-                with gr.Column(scale=1, elem_classes="output-section"):
-                    gr.Markdown("### 输出结果")
-                    tts_sft_output = gr.Audio(
-                        type="numpy",
-                        label="生成语音",
-                        interactive=False
-                    )
-                    tts_sft_button = gr.Button(
-                        "🚀 生成语音",
-                        variant="primary",
-                        elem_classes="btn-generate"
-                    )
-                    gr.Examples(
-                        examples=[
-                            ["Hello, welcome to Marco-Voice text-to-speech system.", "英文男"],
-                            ["こんにちは、Marco-Voiceテキスト読み上げシステムへようこそ。", "日语男"],
-                            ["안녕하세요, Marco-Voice 텍스트 음성 변환 시스템에 오신 것을 환영합니다.", "韩语女"]
+                            ["这真是太令人兴奋了!我们刚刚完成了一个重大突破!", "Happy", "徐志胜"],
+                            ["我简直不敢相信!这怎么可能发生?", "Surprise", "李雪琴"],
+                            ["这太让人失望了,我们所有的努力都白费了。", "Sad", "范志毅"]
                         ],
-                        inputs=[
-                        label="
+                        inputs=[tts_text_v1, emotion_v1, speaker_v1],
+                        label="Emotion example"
                     )
-
-        # Tab 3: emotion control
-        with gr.TabItem("😄 情感控制", id=2):
+        with gr.TabItem("😄 Control of emotion enhenced", id=1):
             with gr.Row():
                 with gr.Column(scale=2, elem_classes="input-section"):
-                    gr.Markdown("###
-
+                    gr.Markdown("### Input Settings")
+                    tts_text_v2 = gr.Textbox(
                         lines=3,
-                        placeholder="
-                        label="
+                        placeholder="Enter the text content you want to compose...",
+                        label="Synthesizing text",
                         value="这真是太令人兴奋了!我们刚刚完成了一个重大突破!"
                     )
 
                     with gr.Row():
                         with gr.Column():
-
+                            speed_v2 = gr.Slider(
                                 minimum=0.5,
                                 maximum=2.0,
                                 value=1.0,
                                 step=0.1,
-                                label="
+                                label="Speaking rate control"
                             )
                         with gr.Column():
-
-                                choices=["Angry", "Happy", "Surprise", "Sad"],
+                            emotion_v2 = gr.Radio(
+                                choices=["Angry", "Happy", "Surprise", "Sad", "Fearful", "Jolliest"],
                                 value="Happy",
-                                label="
+                                label="Emotion selection"
                             )
 
                     with gr.Row():
                         with gr.Column():
-
+                            speaker_v2 = gr.Dropdown(
                                 choices=names,
                                 value="徐志胜",
-                                label="
+                                label="Preset timbre"
                             )
                         with gr.Column():
-                            gr.Markdown("###
-                            with gr.Accordion("
-                                gr.Markdown("
-
+                            gr.Markdown("### Or use a custom timbre")
+                            with gr.Accordion("Upload reference audio", open=False, elem_classes="accordion"):
+                                gr.Markdown("Upload 3-10 seconds of clear human voice as reference audio")
+                                ref_audio_v2 = gr.Audio(
                                     type="filepath",
-                                    label="
+                                    label="upload audio",
                                     elem_classes="audio-upload"
                                 )
-
+                                ref_text_v2 = gr.Textbox(
                                     lines=2,
-                                    placeholder="
-                                    label="
+                                    placeholder="ref text content...",
+                                    label="ref text"
                                 )
 
                     gr.Markdown("""
                     <div class="model-info">
-                    <p><span class="info-icon">ℹ️</span> <strong
-                    <p><span class="info-icon">💡</span> <strong
+                    <p><span class="info-icon">ℹ️</span> <strong>specification of a model:</strong> This model added emotion control ability on the basis of timbre cloning, and could generate speech with specific emotion.</p>
+                    <p><span class="info-icon">💡</span> <strong>use skill:</strong> The sentiment expression effect is related to the content of the text, make sure the text matches the selected sentiment.</p>
                     </div>
                     """)
 
                 with gr.Column(scale=1, elem_classes="output-section"):
-                    gr.Markdown("###
-
+                    gr.Markdown("### output result")
+                    tts_v2_output = gr.Audio(
                         type="filepath",
-                        label="
+                        label="Generating speech",
                         interactive=False
                     )
-
-                        "🚀
+                    tts_v2_button = gr.Button(
+                        "🚀 Generating speech",
                         variant="primary",
                         elem_classes="btn-generate"
                     )
@@ -779,35 +726,28 @@
                             ["我简直不敢相信!这怎么可能发生?", "Surprise", "李雪琴"],
                             ["这太让人失望了,我们所有的努力都白费了。", "Sad", "范志毅"]
                         ],
-                        inputs=[
-                        label="
+                        inputs=[tts_text_v2, emotion_v2, speaker_v2],
+                        label="emotion example"
                     )
 
-    # footer
     gr.Markdown("""
    <div class="footer">
-    <p>Marco-Voice
-    <p
+    <p>Marco-Voice text to speech v1.0 | based on excepent open source tts model | tech support: tech@marco-voice.com</p>
+    <p>attention: synthesised speech only use to tech share</p>
    </div>
    """)
 
-    # bind events  # tts_text, speed, speaker, emotion, ref_audio, ref_text
-    tts_base_button.click(
-        fn=generate_speech_base,
-        inputs=[tts_text_v1, speed_v1, speaker_v1, ref_audio_v1, ref_text_v1],
-        outputs=tts_base_output
-    )
 
-
-        fn=
-        inputs=[
-        outputs=
+    tts_v1_button.click(
+        fn=generate_speech_speakerminus,
+        inputs=[tts_text_v1, speed_v1, speaker_v1, emotion_v1, ref_audio_v1, ref_text_v1],
+        outputs=tts_v1_output
     )
     # tts_text, speed, speaker, key, ref_audio, ref_text
-
-        fn=
-        inputs=[
-        outputs=
+    tts_v2_button.click(
+        fn=generate_speech_sft,
+        inputs=[tts_text_v2, speed_v2, speaker_v2, emotion_v2, ref_audio_v2, ref_text_v2],
+        outputs=tts_v2_output
    )
 
 if __name__ == "__main__":
@@ -815,5 +755,5 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=10163,
         share=True,
-        favicon_path=
+        favicon_path=logo_path2
     )
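Note on the new generation functions: generate_speech_speakerminus and generate_speech_sft duplicate the emotion lookup and re-run torch.load("./emotion_info.pt") on every request. Below is a minimal sketch of the same branching factored into one cached helper; it assumes only the emotion_info.pt layout implied by the diff (top-level buckets "male005", "female005", "female003" keyed by emotion name), and the module-level cache is an illustration, not part of the commit.

import torch

# Cached lookup equivalent to the if/elif chains in the diff above.
_EMOTION_SOURCE = {"Angry": "male005", "Surprise": "male005", "Happy": "male005",
                   "Sad": "female005", "Fearful": "female003"}
_EMOTION_BANK = None

def lookup_emotion_info(key):
    global _EMOTION_BANK
    if _EMOTION_BANK is None:
        _EMOTION_BANK = torch.load("./emotion_info.pt")  # load once, not per request
    bucket = _EMOTION_SOURCE.get(key, "male005")  # the diff falls back to male005
    return _EMOTION_BANK[bucket][key]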
cosyvoice_rodis/__init__.py ADDED

@@ -0,0 +1,2 @@
+#
+
cosyvoice_rodis/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (165 Bytes)

cosyvoice_rodis/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (186 Bytes)

cosyvoice_rodis/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (163 Bytes)

cosyvoice_rodis/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (140 Bytes)
cosyvoice_rodis/bin/average_model.py ADDED

@@ -0,0 +1,91 @@
+#
+
+# Copyright (c) 2020 Mobvoi Inc (Di Wu)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+import glob
+
+import yaml
+import torch
+
+def get_args():
+    parser = argparse.ArgumentParser(description='average model')
+    parser.add_argument('--dst_model', required=True, help='averaged model')
+    parser.add_argument('--src_path',
+                        required=True,
+                        help='src model path for average')
+    parser.add_argument('--val_best',
+                        action="store_true",
+                        help='averaged model')
+    parser.add_argument('--num',
+                        default=5,
+                        type=int,
+                        help='nums for averaged model')
+
+    args = parser.parse_args()
+    print(args)
+    return args
+
+def main():
+    args = get_args()
+    val_scores = []
+    if args.val_best:
+        yamls = glob.glob('{}/*.yaml'.format(args.src_path))
+        yamls = [
+            f for f in yamls
+            if not (os.path.basename(f).startswith('train')
+                    or os.path.basename(f).startswith('init'))
+        ]
+        for y in yamls:
+            with open(y, 'r') as f:
+                dic_yaml = yaml.load(f, Loader=yaml.BaseLoader)
+                loss = float(dic_yaml['loss_dict']['loss'])
+                epoch = int(dic_yaml['epoch'])
+                step = int(dic_yaml['step'])
+                tag = dic_yaml['tag']
+                val_scores += [[epoch, step, loss, tag]]
+        sorted_val_scores = sorted(val_scores,
+                                   key=lambda x: x[2],
+                                   reverse=False)
+        print("best val (epoch, step, loss, tag) = " +
+              str(sorted_val_scores[:args.num]))
+        path_list = [
+            args.src_path + '/epoch_{}_whole.pt'.format(score[0])
+            for score in sorted_val_scores[:args.num]
+        ]
+        print(path_list)
+    avg = {}
+    num = args.num
+    assert num == len(path_list)
+    for path in path_list:
+        print('Processing {}'.format(path))
+        states = torch.load(path, map_location=torch.device('cpu'))
+        for k in states.keys():
+            if k not in avg.keys():
+                avg[k] = states[k].clone()
+            else:
+                avg[k] += states[k]
+    # average
+    for k in avg.keys():
+        if avg[k] is not None:
+            # pytorch 1.6 use true_divide instead of /=
+            avg[k] = torch.true_divide(avg[k], num)
+    print('Saving to {}'.format(args.dst_model))
+    torch.save(avg, args.dst_model)
+
+if __name__ == '__main__':
+    main()
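The core of average_model.py is element-wise averaging of checkpoint state dicts. A toy, self-contained check of that arithmetic (hypothetical two-tensor "checkpoints"; real inputs are the epoch_{n}_whole.pt files selected above):

import torch

ckpt_a = {"w": torch.tensor([1.0, 2.0]), "b": torch.tensor([0.0])}
ckpt_b = {"w": torch.tensor([3.0, 4.0]), "b": torch.tensor([2.0])}

avg = {}
for states in (ckpt_a, ckpt_b):
    for k, v in states.items():
        # same accumulate-then-divide pattern as the script
        avg[k] = v.clone() if k not in avg else avg[k] + v
for k in avg:
    avg[k] = torch.true_divide(avg[k], 2)

print(avg["w"])  # tensor([2., 3.])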
cosyvoice_rodis/bin/export_jit.py ADDED

@@ -0,0 +1,73 @@
+#
+
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+import torch
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice_rodis.cli.cosyvoice import CosyVoice
+
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/CosyVoice-300M',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    torch._C._jit_set_fusion_strategy([('STATIC', 1)])
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_set_profiling_executor(False)
+
+    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
+
+    # 1. export llm text_encoder
+    llm_text_encoder = cosyvoice.model.llm.text_encoder.half()
+    script = torch.jit.script(llm_text_encoder)
+    script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+
+    # 2. export llm llm
+    llm_llm = cosyvoice.model.llm.llm.half()
+    script = torch.jit.script(llm_llm)
+    script = torch.jit.freeze(script, preserved_attrs=['forward_chunk'])
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+
+    # 3. export flow encoder
+    flow_encoder = cosyvoice.model.flow.encoder
+    script = torch.jit.script(flow_encoder)
+    script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+
+if __name__ == '__main__':
+    main()
cosyvoice_rodis/bin/export_onnx.py ADDED

@@ -0,0 +1,110 @@
+#
+
+# Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, hexisyztem@icloud.com)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+import onnxruntime
+import random
+import torch
+from tqdm import tqdm
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice_rodis.cli.cosyvoice import CosyVoice
+
+def get_dummy_input(batch_size, seq_len, out_channels, device):
+    x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
+    mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    t = torch.rand((batch_size), dtype=torch.float32, device=device)
+    spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
+    cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    return x, mask, mu, t, spks, cond
+
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/CosyVoice-300M',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
+
+    # 1. export flow decoder estimator
+    estimator = cosyvoice.model.flow.decoder.estimator
+
+    device = cosyvoice.model.device
+    batch_size, seq_len = 1, 256
+    out_channels = cosyvoice.model.flow.decoder.estimator.out_channels
+    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+    torch.onnx.export(
+        estimator,
+        (x, mask, mu, t, spks, cond),
+        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+        export_params=True,
+        opset_version=18,
+        do_constant_folding=True,
+        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+        output_names=['estimator_out'],
+        dynamic_axes={
+            'x': {0: 'batch_size', 2: 'seq_len'},
+            'mask': {0: 'batch_size', 2: 'seq_len'},
+            'mu': {0: 'batch_size', 2: 'seq_len'},
+            'cond': {0: 'batch_size', 2: 'seq_len'},
+            't': {0: 'batch_size'},
+            'spks': {0: 'batch_size'},
+            'estimator_out': {0: 'batch_size', 2: 'seq_len'},
+        }
+    )
+
+    # 2. test computation consistency
+    option = onnxruntime.SessionOptions()
+    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    option.intra_op_num_threads = 1
+    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                  sess_options=option, providers=providers)
+
+    for _ in tqdm(range(10)):
+        x, mask, mu, t, spks, cond = get_dummy_input(random.randint(1, 6), random.randint(16, 512), out_channels, device)
+        output_pytorch = estimator(x, mask, mu, t, spks, cond)
+        ort_inputs = {
+            'x': x.cpu().numpy(),
+            'mask': mask.cpu().numpy(),
+            'mu': mu.cpu().numpy(),
+            't': t.cpu().numpy(),
+            'spks': spks.cpu().numpy(),
+            'cond': cond.cpu().numpy()
+        }
+        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+
+if __name__ == "__main__":
+    main()
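Outside the consistency check above, the exported estimator can be run standalone with onnxruntime. A sketch using the same input names and shapes as get_dummy_input(); channels=80 is an assumed value (a typical mel dimension), not read from a real checkpoint:

import numpy as np
import onnxruntime

batch, channels, seq = 1, 80, 256
sess = onnxruntime.InferenceSession(
    "pretrained_models/CosyVoice-300M/flow.decoder.estimator.fp32.onnx",
    providers=["CPUExecutionProvider"])
feeds = {
    "x": np.random.rand(batch, channels, seq).astype(np.float32),
    "mask": np.ones((batch, 1, seq), dtype=np.float32),
    "mu": np.random.rand(batch, channels, seq).astype(np.float32),
    "t": np.random.rand(batch).astype(np.float32),
    "spks": np.random.rand(batch, channels).astype(np.float32),
    "cond": np.random.rand(batch, channels, seq).astype(np.float32),
}
estimator_out = sess.run(None, feeds)[0]  # shape (batch, channels, seq)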
cosyvoice_rodis/bin/inference.py ADDED

@@ -0,0 +1,114 @@
+#
+
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import torch
+from torch.utils.data import DataLoader
+import torchaudio
+from hyperpyyaml import load_hyperpyyaml
+from tqdm import tqdm
+from cosyvoice_rodis.cli.model import CosyVoiceModel
+from cosyvoice_rodis.dataset.dataset import Dataset
+
+def get_args():
+    parser = argparse.ArgumentParser(description='inference with your model')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--prompt_data', required=True, help='prompt data file')
+    parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
+    parser.add_argument('--tts_text', required=True, help='tts input file')
+    parser.add_argument('--llm_model', required=True, help='llm model file')
+    parser.add_argument('--flow_model', required=True, help='flow model file')
+    parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
+    parser.add_argument('--gpu',
+                        type=int,
+                        default=-1,
+                        help='gpu id for this rank, -1 for cpu')
+    parser.add_argument('--mode',
+                        default='sft',
+                        choices=['sft', 'zero_shot'],
+                        help='inference mode')
+    parser.add_argument('--result_dir', required=True, help='asr result file')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+
+    # Init cosyvoice models from configs
+    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
+    device = torch.device('cuda' if use_cuda else 'cpu')
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f)
+
+    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], True)
+    model.load(args.llm_model, args.flow_model, args.hifigan_model)
+
+    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
+                           tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
+    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+    del configs
+    os.makedirs(args.result_dir, exist_ok=True)
+    fn = os.path.join(args.result_dir, 'wav.scp')
+    f = open(fn, 'w')
+    with torch.no_grad():
+        for _, batch in tqdm(enumerate(test_data_loader)):
+            utts = batch["utts"]
+            assert len(utts) == 1, "inference mode only support batchsize 1"
+            text_token = batch["text_token"].to(device)
+            text_token_len = batch["text_token_len"].to(device)
+            tts_index = batch["tts_index"]
+            tts_text_token = batch["tts_text_token"].to(device)
+            tts_text_token_len = batch["tts_text_token_len"].to(device)
+            speech_token = batch["speech_token"].to(device)
+            speech_token_len = batch["speech_token_len"].to(device)
+            speech_feat = batch["speech_feat"].to(device)
+            speech_feat_len = batch["speech_feat_len"].to(device)
+            utt_embedding = batch["utt_embedding"].to(device)
+            spk_embedding = batch["spk_embedding"].to(device)
+            if args.mode == 'sft':
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
+            else:
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'prompt_text': text_token, 'prompt_text_len': text_token_len,
+                               'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                               'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                               'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                               'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
+            tts_speeches = []
+            for model_output in model.tts(**model_input):
+                tts_speeches.append(model_output['tts_speech'])
+            tts_speeches = torch.concat(tts_speeches, dim=1)
+            tts_key = '{}_{}'.format(utts[0], tts_index[0])
+            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
+            torchaudio.save(tts_fn, tts_speeches, sample_rate=22050)
+            f.write('{} {}\n'.format(tts_key, tts_fn))
+            f.flush()
+    f.close()
+    logging.info('Result wav.scp saved in {}'.format(fn))
+
+if __name__ == '__main__':
+    main()
cosyvoice_rodis/bin/train.py ADDED

@@ -0,0 +1,159 @@
+#
+
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+print("_+++++++++++++++++++++")
+print(sys.path)
+sys.path.append("/mnt/workspace/baipeng/project/Marco-Voice/Models/marco_voice")
+import argparse
+import datetime
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+from copy import deepcopy
+import os
+import torch
+import torch.distributed as dist
+import deepspeed
+
+from hyperpyyaml import load_hyperpyyaml
+
+from torch.distributed.elastic.multiprocessing.errors import record
+
+from cosyvoice_rodis.utils.executor import Executor
+from cosyvoice_rodis.utils.train_utils import (
+    init_distributed,
+    init_dataset_and_dataloader,
+    init_optimizer_and_scheduler,
+    init_summarywriter, save_model,
+    wrap_cuda_model, check_modify_and_save_config)
+def get_args():
+    parser = argparse.ArgumentParser(description='training your network')
+    parser.add_argument('--train_engine',
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'deepspeed'],
+                        help='Engine for paralleled training')
+    parser.add_argument('--model', required=True, help='model which will be trained')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--train_data', required=True, help='train data file')
+    parser.add_argument('--cv_data', required=True, help='cv data file')
+    parser.add_argument('--checkpoint', help='checkpoint model')
+    parser.add_argument('--model_dir', required=True, help='save model dir')
+    parser.add_argument('--tensorboard_dir',
+                        default='tensorboard',
+                        help='tensorboard log dir')
+    parser.add_argument('--ddp.dist_backend',
+                        dest='dist_backend',
+                        default='nccl',
+                        choices=['nccl', 'gloo'],
+                        help='distributed backend')
+    parser.add_argument('--num_workers',
+                        default=0,
+                        type=int,
+                        help='num of subprocess workers for reading')
+    parser.add_argument('--prefetch',
+                        default=100,
+                        type=int,
+                        help='prefetch number')
+    parser.add_argument('--pin_memory',
+                        action='store_true',
+                        default=False,
+                        help='Use pinned memory buffers used for reading')
+    parser.add_argument('--use_amp',
+                        action='store_true',
+                        default=False,
+                        help='Use automatic mixed precision training')
+    parser.add_argument('--deepspeed.save_states',
+                        dest='save_states',
+                        default='model_only',
+                        choices=['model_only', 'model+optimizer'],
+                        help='save model/optimizer states')
+    parser.add_argument('--timeout',
+                        default=60,
+                        type=int,
+                        help='timeout (in seconds) of cosyvoice_join.')
+    parser = deepspeed.add_config_arguments(parser)
+    args = parser.parse_args()
+    return args
+
+@record
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    # gan train has some special initialization logic
+    gan = True if args.model == 'hifigan' else False
+
+    override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
+    if gan is True:
+        override_dict.pop('hift')
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f, overrides=override_dict)
+    if gan is True:
+        configs['train_conf'] = configs['train_conf_gan']
+    configs['train_conf'].update(vars(args))
+
+    # Init env for ddp
+    init_distributed(args)
+
+    # Get dataset & dataloader
+    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
+        init_dataset_and_dataloader(args, configs, gan)
+    # Do some sanity checks and save config to arsg.model_dir
+    configs = check_modify_and_save_config(args, configs)
+
+    # Tensorboard summary
+    writer = init_summarywriter(args)
+
+    # load checkpoint
+    model = configs[args.model]
+    if args.checkpoint is not None:
+        if os.path.exists(args.checkpoint):
+            model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'), strict=False)
model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'), strict=False)
|
| 126 |
+
else:
|
| 127 |
+
logging.warning('checkpoint {} do not exsist!'.format(args.checkpoint))
|
| 128 |
+
|
| 129 |
+
# Dispatch model from cpu to gpu
|
| 130 |
+
model = wrap_cuda_model(args, model)
|
| 131 |
+
|
| 132 |
+
# Get optimizer & scheduler
|
| 133 |
+
model, optimizer, scheduler, optimizer_d, scheduler_d = init_optimizer_and_scheduler(args, configs, model, gan)
|
| 134 |
+
|
| 135 |
+
# Save init checkpoints
|
| 136 |
+
info_dict = deepcopy(configs['train_conf'])
|
| 137 |
+
save_model(model, 'init', info_dict)
|
| 138 |
+
|
| 139 |
+
# Get executor
|
| 140 |
+
executor = Executor(gan=gan)
|
| 141 |
+
|
| 142 |
+
# Init scaler, used for pytorch amp mixed precision training
|
| 143 |
+
scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
|
| 144 |
+
|
| 145 |
+
# Start training loop
|
| 146 |
+
for epoch in range(info_dict['max_epoch']):
|
| 147 |
+
executor.epoch = epoch
|
| 148 |
+
train_dataset.set_epoch(epoch)
|
| 149 |
+
dist.barrier()
|
| 150 |
+
group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
|
| 151 |
+
if gan is True:
|
| 152 |
+
executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
|
| 153 |
+
writer, info_dict, scaler, group_join)
|
| 154 |
+
else:
|
| 155 |
+
executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join) #进
|
| 156 |
+
dist.destroy_process_group(group_join)
|
| 157 |
+
|
| 158 |
+
if __name__ == '__main__':
|
| 159 |
+
main()
|
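A note on the config handling above: `override_dict` leans on hyperpyyaml's `overrides` argument, setting every model section other than the one being trained to None so it is never instantiated. A minimal sketch of that mechanism (the YAML content here is invented purely for illustration):

from io import StringIO
from hyperpyyaml import load_hyperpyyaml

yaml_text = """
llm: !new:collections.OrderedDict
flow: !new:collections.OrderedDict
hift: !new:collections.OrderedDict
"""

# Train only 'flow': null out the other top-level sections so hyperpyyaml
# never instantiates them.
override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != 'flow'}
configs = load_hyperpyyaml(StringIO(yaml_text), overrides=override_dict)
assert configs['llm'] is None and configs['flow'] is not None
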
cosyvoice_rodis/cli/__init__.py
ADDED
@@ -0,0 +1,2 @@
#

cosyvoice_rodis/cli/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (169 Bytes)

cosyvoice_rodis/cli/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (190 Bytes)

cosyvoice_rodis/cli/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (167 Bytes)

cosyvoice_rodis/cli/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (144 Bytes)

cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-310.pyc
ADDED
Binary file (4.57 kB)

cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-312.pyc
ADDED
Binary file (8.91 kB)

cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-38.pyc
ADDED
Binary file (4.49 kB)

cosyvoice_rodis/cli/__pycache__/cosyvoice.cpython-39.pyc
ADDED
Binary file (4.57 kB)

cosyvoice_rodis/cli/__pycache__/frontend.cpython-310.pyc
ADDED
Binary file (7.18 kB)

cosyvoice_rodis/cli/__pycache__/frontend.cpython-38.pyc
ADDED
Binary file (7.19 kB)

cosyvoice_rodis/cli/__pycache__/frontend.cpython-39.pyc
ADDED
Binary file (7.16 kB)

cosyvoice_rodis/cli/__pycache__/model.cpython-310.pyc
ADDED
Binary file (7.88 kB)

cosyvoice_rodis/cli/__pycache__/model.cpython-38.pyc
ADDED
Binary file (7.71 kB)

cosyvoice_rodis/cli/__pycache__/model.cpython-39.pyc
ADDED
Binary file (7.76 kB)

cosyvoice_rodis/cli/cosyvoice.py
ADDED
@@ -0,0 +1,114 @@
#
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
from tqdm import tqdm
from hyperpyyaml import load_hyperpyyaml
from modelscope import snapshot_download
from .frontend import CosyVoiceFrontEnd
from .model import CosyVoiceModel
from ..utils.file_utils import logging


class CosyVoice:

    def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
        instruct = True if '-Instruct' in model_dir else False
        self.model_dir = model_dir
        if not os.path.exists(model_dir):
            model_dir = snapshot_download(model_dir)
        with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
            configs = load_hyperpyyaml(f)
        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
                                          configs['feat_extractor'],
                                          '{}/campplus.onnx'.format(model_dir),
                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                          '{}/spk2info.pt'.format(model_dir),
                                          instruct,
                                          configs['allowed_special'])
        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
        self.model.load('{}/llm.pt'.format(model_dir),
                        '{}/flow.pt'.format(model_dir),
                        '{}/hift.pt'.format(model_dir))
        if load_jit:
            self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir),
                                '{}/llm.llm.fp16.zip'.format(model_dir),
                                '{}/flow.encoder.fp32.zip'.format(model_dir))
        if load_onnx:
            self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
        del configs

    def list_avaliable_spks(self):
        spks = list(self.frontend.spk2info.keys())
        return spks

    def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
        for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
            model_input = self.frontend.frontend_sft(i, spk_id)
            start_time = time.time()
            logging.info('synthesis text {}'.format(i))
            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                speech_len = model_output['tts_speech'].shape[1] / 22050
                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                yield model_output
                start_time = time.time()

    def synthesize(self, tts_text, prompt_text, prompt_speech_16k, key, emotion_embedding, stream=False, speed=1.0):
        prompt_text = self.frontend.text_normalize(key + '<endofprompt>' + prompt_text, split=False)
        for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
            if len(i) < 0.5 * len(prompt_text):
                logging.warning('synthesis text {} is much shorter than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
            model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, emotion_embedding)
            print("input:", model_input)
            start_time = time.time()
            logging.info('synthesis text {}'.format(i))
            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                speech_len = model_output['tts_speech'].shape[1] / 22050
                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                yield model_output
                start_time = time.time()

    def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
        if self.frontend.instruct is True:
            raise ValueError('{} does not support cross_lingual inference'.format(self.model_dir))
        for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
            model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
            start_time = time.time()
            logging.info('synthesis text {}'.format(i))
            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                speech_len = model_output['tts_speech'].shape[1] / 22050
                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                yield model_output
                start_time = time.time()

    def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
        instruct_text = self.frontend.text_normalize(instruct_text, split=False)
        for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
            model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
            start_time = time.time()
            logging.info('synthesis text {}'.format(i))
            for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
                speech_len = model_output['tts_speech'].shape[1] / 22050
                logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
                yield model_output
                start_time = time.time()

    def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
        model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k)
        start_time = time.time()
        for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
            speech_len = model_output['tts_speech'].shape[1] / 22050
            logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
            yield model_output
            start_time = time.time()

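Since `synthesize` above is a generator over normalized text segments, a caller iterates it and stitches the chunks. A hedged usage sketch (the model directory, prompt wav, emotion key, and the 192-dim zero emotion embedding are all placeholder assumptions, not values fixed by this commit):

import torch
import torchaudio
from cosyvoice.utils.file_utils import load_wav
from cosyvoice_rodis.cli.cosyvoice import CosyVoice

tts = CosyVoice('pretrained_models/CosyVoice-300M', load_jit=False, load_onnx=False, fp16=False)
prompt = load_wav('prompt.wav', 16000)   # 16 kHz reference audio (placeholder path)
emotion = torch.zeros(1, 192)            # stand-in emotion embedding

chunks = [out['tts_speech']
          for out in tts.synthesize('Hello there.', 'A short prompt.',
                                    prompt, 'Happy', emotion)]
torchaudio.save('out.wav', torch.concat(chunks, dim=1), sample_rate=22050)
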
cosyvoice_rodis/cli/frontend.py
ADDED
@@ -0,0 +1,192 @@
#
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from functools import partial
import onnxruntime
import torch
import numpy as np
import whisper
from typing import Callable
import torchaudio.compliance.kaldi as kaldi
import torchaudio
import os
import re
import inflect
# try:
#     import ttsfrd
#     use_ttsfrd = True
# except ImportError:
#     print("failed to import ttsfrd, use WeTextProcessing instead")
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer
use_ttsfrd = False
from ..utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph


class CosyVoiceFrontEnd:

    def __init__(self,
                 get_tokenizer: Callable,
                 feat_extractor: Callable,
                 campplus_model: str,
                 speech_tokenizer_model: str,
                 spk2info: str = '',
                 instruct: bool = False,
                 allowed_special: str = 'all'):
        self.tokenizer = get_tokenizer()
        self.feat_extractor = feat_extractor
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1
        self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
        self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
                                                                     providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
                                                                                "CPUExecutionProvider"])
        if os.path.exists(spk2info):
            self.spk2info = torch.load(spk2info, map_location=self.device)
        else:
            self.spk2info = {}
        self.instruct = instruct
        self.allowed_special = allowed_special
        self.inflect_parser = inflect.engine()
        self.use_ttsfrd = use_ttsfrd
        if self.use_ttsfrd:
            self.frd = ttsfrd.TtsFrontendEngine()
            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
            assert self.frd.initialize('/mnt/workspace/baipeng/project/Marco-Voice/Models/marco_voice/utils/pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \
                'failed to initialize ttsfrd resource'
            self.frd.set_lang_type('pinyinvg')
            self.frd.enable_pinyin_mix(True)
            self.frd.set_breakmodel_index(1)
        else:
            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
            self.en_tn_model = EnNormalizer()

    def _extract_text_token(self, text):
        text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
        text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
        text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)  # 14 21
        return text_token, text_token_len

    def _extract_speech_token(self, speech):
        assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
        speech_token = self.speech_tokenizer_session.run(None,
                                                         {self.speech_tokenizer_session.get_inputs()[0].name:
                                                          feat.detach().cpu().numpy(),
                                                          self.speech_tokenizer_session.get_inputs()[1].name:
                                                          np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
        return speech_token, speech_token_len

    def _extract_spk_embedding(self, speech):
        feat = kaldi.fbank(speech,
                           num_mel_bins=80,
                           dither=0,
                           sample_frequency=16000)
        feat = feat - feat.mean(dim=0, keepdim=True)
        embedding = self.campplus_session.run(None,
                                              {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
        embedding = torch.tensor([embedding]).to(self.device)
        return embedding

    def _extract_speech_feat(self, speech):
        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
        speech_feat = speech_feat.unsqueeze(dim=0)
        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
        return speech_feat, speech_feat_len

    def text_normalize(self, text, split=True):
        text = text.strip()
        if contains_chinese(text):
            if self.use_ttsfrd:
                text = self.frd.get_frd_extra_info(text, 'input')
            else:
                text = self.zh_tn_model.normalize(text)
            text = text.replace("\n", "")
            text = replace_blank(text)
            text = replace_corner_mark(text)
            text = text.replace(".", "。")
            text = text.replace(" - ", ",")
            text = remove_bracket(text)
            text = re.sub(r'[,,、]+$', '。', text)
            texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
                                         token_min_n=60, merge_len=20, comma_split=False))
        else:
            if self.use_ttsfrd:
                text = self.frd.get_frd_extra_info(text, 'input')
            else:
                text = self.en_tn_model.normalize(text)
                text = spell_out_number(text, self.inflect_parser)
            texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                         token_min_n=60, merge_len=20, comma_split=False))
        if split is False:
            return text
        return texts

    def frontend_sft(self, tts_text, spk_id):
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        embedding = self.spk2info[spk_id]['embedding']
        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
        return model_input

    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, emotion_speakerminus):
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
        embedding = self._extract_spk_embedding(prompt_speech_16k)
        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                       'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
                       'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
                       'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                       'llm_embedding': embedding, 'emotion_embedding': emotion_speakerminus, 'flow_embedding': embedding}
        return model_input

    def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k)
        # in cross lingual mode, we remove prompt in llm
        del model_input['prompt_text']
        del model_input['prompt_text_len']
        del model_input['llm_prompt_speech_token']
        del model_input['llm_prompt_speech_token_len']
        return model_input

    def frontend_instruct(self, tts_text, spk_id, instruct_text):
        model_input = self.frontend_sft(tts_text, spk_id)
        # in instruct mode, we remove spk_embedding in llm due to information leakage
        del model_input['llm_embedding']
        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
        model_input['prompt_text'] = instruct_text_token
        model_input['prompt_text_len'] = instruct_text_token_len
        return model_input

    def frontend_vc(self, source_speech_16k, prompt_speech_16k):
        prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
        prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
        embedding = self._extract_spk_embedding(prompt_speech_16k)
        source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
        model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
                       'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
                       'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
                       'flow_embedding': embedding}
        return model_input

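The speaker-embedding path in `_extract_spk_embedding` above is 80-bin Kaldi fbank features with per-utterance mean subtraction, fed to the CAM++ ONNX session. The feature step can be sanity-checked on its own (the waveform here is synthetic and the ONNX model is not loaded):

import torch
import torchaudio.compliance.kaldi as kaldi

speech = torch.randn(1, 16000)        # 1 s of fake 16 kHz audio, shape (channels, samples)
feat = kaldi.fbank(speech,
                   num_mel_bins=80,   # same settings as _extract_spk_embedding
                   dither=0,
                   sample_frequency=16000)
feat = feat - feat.mean(dim=0, keepdim=True)  # mean-normalize over time
print(feat.shape)                     # (num_frames, 80); roughly 98 frames for 1 s
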
cosyvoice_rodis/cli/model.py
ADDED
@@ -0,0 +1,257 @@
#
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import numpy as np
import threading
import time
from torch.nn import functional as F
from contextlib import nullcontext
import uuid
from ..utils.common import fade_in_out


class CosyVoiceModel:

    def __init__(self,
                 llm: torch.nn.Module,
                 flow: torch.nn.Module,
                 hift: torch.nn.Module,
                 fp16: bool):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.llm = llm
        self.flow = flow
        self.hift = hift
        self.fp16 = fp16
        self.token_min_hop_len = 2 * self.flow.input_frame_rate
        self.token_max_hop_len = 4 * self.flow.input_frame_rate
        self.token_overlap_len = 20
        # mel fade in out
        self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
        self.mel_window = np.hamming(2 * self.mel_overlap_len)
        # hift cache
        self.mel_cache_len = 20
        self.source_cache_len = int(self.mel_cache_len * 256)
        # speech fade in out
        self.speech_window = np.hamming(2 * self.source_cache_len)
        # rtf and decoding related
        self.stream_scale_factor = 1
        assert self.stream_scale_factor >= 1, 'stream_scale_factor should be no less than 1, change it according to your actual rtf'
        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
        self.lock = threading.Lock()
        # dict used to store session related variables
        self.tts_speech_token_dict = {}
        self.llm_end_dict = {}
        self.mel_overlap_dict = {}
        self.flow_cache_dict = {}
        self.hift_cache_dict = {}

    def load(self, llm_model, flow_model, hift_model):
        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=False)
        self.llm.to(self.device).eval()
        if self.fp16 is True:
            self.llm.half()
        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=False)
        self.flow.to(self.device).eval()
        # in case hift_model is a hifigan model
        hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
        self.hift.load_state_dict(hift_state_dict, strict=False)
        self.hift.to(self.device).eval()

    def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
        assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model"
        llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
        self.llm.text_encoder = llm_text_encoder
        llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
        self.llm.llm = llm_llm
        flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
        self.flow.encoder = flow_encoder

    def load_onnx(self, flow_decoder_estimator_model):
        import onnxruntime
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1
        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
        del self.flow.decoder.estimator
        self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)

    def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, emotion_embedding, uuid):
        if self.fp16 is True:
            llm_embedding = llm_embedding.half()
        with self.llm_context:
            for i in self.llm.inference(text=text.to(self.device),
                                        text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
                                        prompt_text=prompt_text.to(self.device),
                                        prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
                                        prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                        prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
                                        embedding=llm_embedding.to(self.device),
                                        emotion_embedding=emotion_embedding.to(self.device)):
                self.tts_speech_token_dict[uuid].append(i)
        self.llm_end_dict[uuid] = True

    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
        tts_mel, flow_cache = self.flow.inference(token=token.to(self.device),
                                                  token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                                  prompt_token=prompt_token.to(self.device),
                                                  prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
                                                  prompt_feat=prompt_feat.to(self.device),
                                                  prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
                                                  embedding=embedding.to(self.device),
                                                  flow_cache=self.flow_cache_dict[uuid])
        self.flow_cache_dict[uuid] = flow_cache

        # mel overlap fade in out
        if self.mel_overlap_dict[uuid].shape[2] != 0:
            tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window)
        # append hift cache
        if self.hift_cache_dict[uuid] is not None:
            hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
            tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
        else:
            hift_cache_source = torch.zeros(1, 1, 0)
        # keep overlap mel and hift cache
        if finalize is False:
            self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
            tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
            tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
            if self.hift_cache_dict[uuid] is not None:
                tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
            self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
                                          'source': tts_source[:, :, -self.source_cache_len:],
                                          'speech': tts_speech[:, -self.source_cache_len:]}
            tts_speech = tts_speech[:, :-self.source_cache_len]
        else:
            if speed != 1.0:
                assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
                tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
            tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
            if self.hift_cache_dict[uuid] is not None:
                tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
        return tts_speech

    def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192), emotion_embedding=torch.zeros(0, 192),
            prompt_text=torch.zeros(1, 0, dtype=torch.int32),
            llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
            flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
            prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
        # this_uuid is used to track variables related to this inference thread
        # print("inside tts()")
        # print(text)
        this_uuid = str(uuid.uuid1())
        with self.lock:
            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
            self.hift_cache_dict[this_uuid] = None
            self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
            self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
        p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, emotion_embedding, this_uuid))
        p.start()
        if stream is True:
            token_hop_len = self.token_min_hop_len
            while True:
                time.sleep(0.1)
                if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
                        .unsqueeze(dim=0)
                    this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                     prompt_token=flow_prompt_speech_token,
                                                     prompt_feat=prompt_speech_feat,
                                                     embedding=flow_embedding,
                                                     uuid=this_uuid,
                                                     finalize=False)
                    yield {'tts_speech': this_tts_speech.cpu()}
                    with self.lock:
                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
                    # increase token_hop_len for better speech quality
                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
                    break
            p.join()
            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
            this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                             prompt_token=flow_prompt_speech_token,
                                             prompt_feat=prompt_speech_feat,
                                             embedding=flow_embedding,
                                             uuid=this_uuid,
                                             finalize=True)
            yield {'tts_speech': this_tts_speech.cpu()}
        else:
            p.join()
            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
            this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                             prompt_token=flow_prompt_speech_token,
                                             prompt_feat=prompt_speech_feat,
                                             embedding=flow_embedding,
                                             uuid=this_uuid,
                                             finalize=True,
                                             speed=speed)
            yield {'tts_speech': this_tts_speech.cpu()}
        with self.lock:
            self.tts_speech_token_dict.pop(this_uuid)
            self.llm_end_dict.pop(this_uuid)
            self.mel_overlap_dict.pop(this_uuid)
            self.hift_cache_dict.pop(this_uuid)
            self.flow_cache_dict.pop(this_uuid)

    def vc(self, source_speech_token, flow_prompt_speech_token, prompt_speech_feat, flow_embedding, stream=False, speed=1.0, **kwargs):
        this_uuid = str(uuid.uuid1())
        with self.lock:
            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = source_speech_token.flatten().tolist(), True
            self.hift_cache_dict[this_uuid] = None
            self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
            self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
        if stream is True:
            token_hop_len = self.token_min_hop_len
            while True:
                if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
                        .unsqueeze(dim=0)
                    this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                     prompt_token=flow_prompt_speech_token,
                                                     prompt_feat=prompt_speech_feat,
                                                     embedding=flow_embedding,
                                                     uuid=this_uuid,
                                                     finalize=False)
                    yield {'tts_speech': this_tts_speech.cpu()}
                    with self.lock:
                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
                    break
            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
            this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                             prompt_token=flow_prompt_speech_token,
                                             prompt_feat=prompt_speech_feat,
                                             embedding=flow_embedding,
                                             uuid=this_uuid,
                                             finalize=True)
            yield {'tts_speech': this_tts_speech.cpu()}
        else:
            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
            this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                             prompt_token=flow_prompt_speech_token,
                                             prompt_feat=prompt_speech_feat,
                                             embedding=flow_embedding,
                                             uuid=this_uuid,
                                             finalize=True,
                                             speed=speed)
            yield {'tts_speech': this_tts_speech.cpu()}
        with self.lock:
            self.tts_speech_token_dict.pop(this_uuid)
            self.llm_end_dict.pop(this_uuid)
            self.mel_overlap_dict.pop(this_uuid)
            self.hift_cache_dict.pop(this_uuid)

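`token2wav` above cross-fades every streamed chunk against the cached tail of the previous one using the Hamming windows built in `__init__`; `fade_in_out` itself comes from `..utils.common` and is not part of this diff. A minimal sketch of that overlap cross-fade, under the assumption that the window's first half ramps the new chunk in while its second half ramps the cached tail out:

import numpy as np
import torch

overlap = 34                                   # e.g. mel_overlap_len
window = torch.from_numpy(np.hamming(2 * overlap)).float()

def cross_fade(new_chunk, prev_tail):
    # new_chunk: (..., T) with T >= overlap; prev_tail: (..., overlap).
    out = new_chunk.clone()
    out[..., :overlap] = (new_chunk[..., :overlap] * window[:overlap]
                          + prev_tail * window[overlap:])
    return out

prev = torch.randn(1, 80, 100)                 # previous mel chunk
nxt = torch.randn(1, 80, 120)                  # next mel chunk
smooth = cross_fade(nxt, prev[..., -overlap:])
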
cosyvoice_rodis/dataset/__init__.py
ADDED
@@ -0,0 +1,2 @@
#

cosyvoice_rodis/dataset/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (190 Bytes)

cosyvoice_rodis/dataset/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (192 Bytes)

cosyvoice_rodis/dataset/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (154 Bytes)

cosyvoice_rodis/dataset/__pycache__/dataset.cpython-310.pyc
ADDED
Binary file (4.96 kB)

cosyvoice_rodis/dataset/__pycache__/dataset.cpython-38.pyc
ADDED
Binary file (4.96 kB)

cosyvoice_rodis/dataset/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (12.8 kB)

cosyvoice_rodis/dataset/__pycache__/processor.cpython-38.pyc
ADDED
Binary file (13.2 kB)

cosyvoice_rodis/dataset/__pycache__/processor.cpython-39.pyc
ADDED
Binary file (12.8 kB)

cosyvoice_rodis/dataset/dataset.py
ADDED
@@ -0,0 +1,163 @@
#
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
#               2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
import json
import math
from functools import partial

import torch
import torch.distributed as dist
from torch.utils.data import IterableDataset
from cosyvoice_rodis.utils.file_utils import read_lists, read_json_lists


class Processor(IterableDataset):

    def __init__(self, source, f, *args, **kw):
        assert callable(f)
        self.source = source
        self.f = f
        self.args = args
        self.kw = kw

    def set_epoch(self, epoch):
        self.source.set_epoch(epoch)

    def __iter__(self):
        """ Return an iterator over the source dataset processed by the
            given processor.
        """
        assert self.source is not None
        assert callable(self.f)
        return self.f(iter(self.source), *self.args, **self.kw)

    def apply(self, f):
        assert callable(f)
        return Processor(self, f, *self.args, **self.kw)


class DistributedSampler:

    def __init__(self, shuffle=True, partition=True):
        self.epoch = -1
        self.update()
        self.shuffle = shuffle
        self.partition = partition

    def update(self):
        assert dist.is_available()
        if dist.is_initialized():
            self.rank = dist.get_rank()
            self.world_size = dist.get_world_size()
        else:
            self.rank = 0
            self.world_size = 1
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            self.worker_id = 0
            self.num_workers = 1
        else:
            self.worker_id = worker_info.id
            self.num_workers = worker_info.num_workers
        return dict(rank=self.rank,
                    world_size=self.world_size,
                    worker_id=self.worker_id,
                    num_workers=self.num_workers)

    def set_epoch(self, epoch):
        self.epoch = epoch

    def sample(self, data):
        """ Sample data according to rank/world_size/num_workers

            Args:
                data(List): input data list

            Returns:
                List: data list after sampling
        """
        data = list(range(len(data)))
        # force datalist even
        if self.partition:
            if self.shuffle:
                random.Random(self.epoch).shuffle(data)
            if len(data) < self.world_size:
                data = data * math.ceil(self.world_size / len(data))
                data = data[:self.world_size]
            data = data[self.rank::self.world_size]
        if len(data) < self.num_workers:
            data = data * math.ceil(self.num_workers / len(data))
            data = data[:self.num_workers]
        data = data[self.worker_id::self.num_workers]
        return data


class DataList(IterableDataset):

    def __init__(self, lists, shuffle=True, partition=True):
        self.lists = lists
        self.sampler = DistributedSampler(shuffle, partition)

    def set_epoch(self, epoch):
        self.sampler.set_epoch(epoch)

    def __iter__(self):
        sampler_info = self.sampler.update()
        indexes = self.sampler.sample(self.lists)
        for index in indexes:
            data = dict(src=self.lists[index])
            data.update(sampler_info)
            yield data


def Dataset(data_list_file,
            data_pipeline,
            mode='train',
            gan=False,
            shuffle=True,
            partition=True,
            tts_file='',
            prompt_utt2data=''):
    """ Construct dataset from arguments

        We have two shuffle stages in the Dataset. The first is global
        shuffle at the shard tar/raw file level. The second is global shuffle
        at the training-sample level.

        Args:
            data_type(str): raw/shard
            tokenizer (BaseTokenizer): tokenizer to tokenize
            partition(bool): whether to do data partition in terms of rank
    """
    # import pdb;pdb.set_trace()
    assert mode in ['train', 'inference']
    lists = read_lists(data_list_file)  # read the data list file
    if mode == 'inference':
        with open(tts_file) as f:
            tts_data = json.load(f)
        utt2lists = read_json_lists(prompt_utt2data)
        # filter unnecessary file in inference mode
        lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
    dataset = DataList(lists,
                       shuffle=shuffle,
                       partition=partition)  # each entry in the list is a tar shard
    if mode == 'inference':
        # map partial arg to parquet_opener func in inference mode
        data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
    if gan is True:
        # map partial arg to padding func in gan mode
        data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
    for func in data_pipeline:
        dataset = Processor(dataset, func, mode=mode)
    return dataset

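`Dataset` above builds its pipeline by repeatedly wrapping the source in `Processor`, so each stage is just a generator function over the previous iterator. The chaining is easy to see with toy stages (the stage functions and numbers below are illustrative only):

def source():
    yield from range(5)

def double(data, mode='train'):
    for x in data:
        yield x * 2

def drop_small(data, mode='train'):
    for x in data:
        if x >= 4:
            yield x

# Mirrors: for func in data_pipeline: dataset = Processor(dataset, func, mode=mode)
it = source()
for func in [double, drop_small]:
    it = func(it, mode='train')
print(list(it))   # [4, 6, 8]
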
cosyvoice_rodis/dataset/processor.py
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
|
| 3 |
+
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
|
| 4 |
+
#
|
| 5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
+
# you may not use this file except in compliance with the License.
|
| 7 |
+
# You may obtain a copy of the License at
|
| 8 |
+
#
|
| 9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
+
#
|
| 11 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 14 |
+
# See the License for the specific language governing permissions and
|
| 15 |
+
# limitations under the License.
|
| 16 |
+
import logging
|
| 17 |
+
import random
|
| 18 |
+
|
| 19 |
+
import pyarrow.parquet as pq
|
| 20 |
+
from io import BytesIO
|
| 21 |
+
import torch
|
| 22 |
+
import torchaudio
|
| 23 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 24 |
+
import torch.nn.functional as F
|
| 25 |
+
|
| 26 |
+
torchaudio.set_audio_backend('soundfile')
|
| 27 |
+
|
| 28 |
+
AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
|
| 29 |
+
|
| 30 |
+
def parquet_opener(data, mode='train', tts_data={}):
|
| 31 |
+
""" Give url or local file, return file descriptor
|
| 32 |
+
Inplace operation.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
data(Iterable[str]): url or local file list
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
Iterable[{src, stream}]
|
| 39 |
+
"""
|
| 40 |
+
for sample in data:
|
| 41 |
+
assert 'src' in sample
|
| 42 |
+
url = sample['src'] #'/mnt/workspace/baipeng/project/Marco-Voice/Dataset/hunhe_data/LZED/processed_xiaoyu30_new/train/parquet/parquet_000000001.tar'
|
| 43 |
+
try:
|
| 44 |
+
df = pq.read_table(url).to_pandas()
|
| 45 |
+
for i in range(len(df)):
|
| 46 |
+
if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
|
| 47 |
+
continue
|
| 48 |
+
sample.update(dict(df.loc[i]))
|
| 49 |
+
if mode == 'train':
|
| 50 |
+
# NOTE do not return sample directly, must initialize a new dict
|
| 51 |
+
yield {**sample}
|
| 52 |
+
else:
|
| 53 |
+
for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
|
| 54 |
+
yield {**sample, 'tts_index': index, 'tts_text': text}
|
| 55 |
+
except Exception as ex:
|
| 56 |
+
logging.warning('Failed to open {}, ex info {}'.format(url, ex))
|
| 57 |
+
|
| 58 |
+
def filter(data,
           max_length=10240,
           min_length=10,
           token_max_length=200,
           token_min_length=1,
           min_output_input_ratio=0.0005,
           max_output_input_ratio=1,
           mode='train'):
    """ Filter samples according to feature and label length.
        Inplace operation.

        Args:
            data: Iterable[{key, wav, label, sample_rate}]
            max_length: drop utterance which is greater than max_length (10ms frames)
            min_length: drop utterance which is less than min_length (10ms frames)
            token_max_length: drop utterance which is greater than
                token_max_length, especially when use char unit for
                english modeling
            token_min_length: drop utterance which is
                less than token_min_length
            min_output_input_ratio: minimal ratio of
                token_length / feats_length (10ms frames)
            max_output_input_ratio: maximum ratio of
                token_length / feats_length (10ms frames)

        Returns:
            Iterable[{key, wav, label, sample_rate}]
    """
    for sample in data:
        sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
        sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
        del sample['audio_data']
        # sample['speech'] is a torch.Tensor; there are 100 frames per second
        num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
        if num_frames < min_length:
            continue
        if num_frames > max_length:
            continue
        if len(sample['text_token']) < token_min_length:
            continue
        if len(sample['text_token']) > token_max_length:
            continue
        if len(sample['speech_token']) == 0:
            continue
        if num_frames != 0:
            if len(sample['text_token']) / num_frames < min_output_input_ratio:
                continue
            if len(sample['text_token']) / num_frames > max_output_input_ratio:
                continue
        yield sample

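# Worked example of the frame arithmetic above, with made-up values: frames
# are 10 ms each, so a 3-second clip at 16 kHz (48000 samples) gives
# 48000 / 16000 * 100 = 300 frames, comfortably between min_length=10 and
# max_length=10240.
def _example_filter_length():
    sr, num_samples = 16000, 48000
    num_frames = num_samples / sr * 100
    assert num_frames == 300.0
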
def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
    """ Resample data.
        Inplace operation.

        Args:
            data: Iterable[{key, wav, label, sample_rate}]
            resample_rate: target resample rate
            min_sample_rate: drop utterance whose sample rate is below this

        Returns:
            Iterable[{key, wav, label, sample_rate}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        sample_rate = sample['sample_rate']
        waveform = sample['speech']
        if sample_rate != resample_rate:
            if sample_rate < min_sample_rate:
                continue
            sample['sample_rate'] = resample_rate
            sample['speech'] = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=resample_rate)(waveform)
        max_val = sample['speech'].abs().max()
        if max_val > 1:
            sample['speech'] /= max_val
        yield sample

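# Minimal sketch of the resample-and-renormalize step on synthetic audio;
# torchaudio.transforms.Resample maps a 16 kHz clip to the 22.05 kHz target
# and the peak is rescaled only when it exceeds 1.
def _example_resample():
    waveform = torch.randn(1, 16000)  # one second at 16 kHz
    resampled = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(waveform)
    max_val = resampled.abs().max()
    if max_val > 1:
        resampled = resampled / max_val  # keep samples within [-1, 1]
    return resampled  # (1, 22050) for a one-second input
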
def truncate(data, truncate_length=24576, mode='train'):
    """ Truncate or pad each waveform to a fixed length.

        Args:
            data: Iterable[{key, wav, label, sample_rate}]
            truncate_length: target length in samples; longer clips are
                randomly cropped, shorter ones are zero-padded

        Returns:
            Iterable[{key, wav, label, sample_rate}]
    """
    for sample in data:
        waveform = sample['speech']
        if waveform.shape[1] > truncate_length:
            start = random.randint(0, waveform.shape[1] - truncate_length)
            waveform = waveform[:, start: start + truncate_length]
        else:
            waveform = torch.concat([waveform, torch.zeros(1, truncate_length - waveform.shape[1])], dim=1)
        sample['speech'] = waveform
        yield sample

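# Illustrative crop-or-pad mirroring truncate(): long clips are randomly
# cropped and short clips are right-padded with zeros, so every waveform
# leaves this stage with exactly truncate_length samples.
def _example_truncate(truncate_length=24576):
    short_wav = torch.randn(1, 10000)
    padded = torch.concat([short_wav, torch.zeros(1, truncate_length - short_wav.shape[1])], dim=1)
    long_wav = torch.randn(1, 30000)
    start = random.randint(0, long_wav.shape[1] - truncate_length)
    cropped = long_wav[:, start: start + truncate_length]
    assert padded.shape[1] == cropped.shape[1] == truncate_length
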
def compute_fbank(data,
                  feat_extractor,
                  mode='train'):
    """ Extract fbank

        Args:
            data: Iterable[{key, wav, label, sample_rate}]

        Returns:
            Iterable[{key, feat, label}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        assert 'utt' in sample
        assert 'text_token' in sample
        waveform = sample['speech']
        mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
        sample['speech_feat'] = mat
        yield sample

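# Sketch of the feat_extractor contract, assuming a torchaudio
# MelSpectrogram as a stand-in; the real extractor is instantiated from the
# training config. The squeeze/transpose turns (1, n_mels, T) into the
# time-major (T, n_mels) layout the rest of the pipeline expects.
def _example_compute_fbank():
    extractor = torchaudio.transforms.MelSpectrogram(sample_rate=22050, n_mels=80)
    waveform = torch.randn(1, 22050)
    mat = extractor(waveform).squeeze(dim=0).transpose(0, 1)
    return mat  # (num_frames, 80)
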
def compute_f0(data, pitch_extractor, mode='train'):
    """ Extract f0

        Args:
            data: Iterable[{key, wav, label, sample_rate}]

        Returns:
            Iterable[{key, feat, label}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        assert 'utt' in sample
        assert 'text_token' in sample
        waveform = sample['speech']
        mat = pitch_extractor(waveform).transpose(1, 2)
        mat = F.interpolate(mat, size=sample['speech_feat'].shape[0], mode='linear')
        sample['pitch_feat'] = mat[0, 0]
        yield sample

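# Sketch of the length alignment above: F.interpolate stretches a (1, 1, T)
# pitch track onto the fbank frame grid so pitch_feat and speech_feat agree
# frame for frame; the sizes here are synthetic.
def _example_align_pitch():
    pitch = torch.randn(1, 1, 57)               # 57 pitch frames
    aligned = F.interpolate(pitch, size=100, mode='linear')
    assert aligned.shape == (1, 1, 100)         # matches 100 fbank frames
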
def parse_embedding(data, normalize, mode='train'):
    """ Parse utt_embedding/spk_embedding/emotion_embedding

        Args:
            data: Iterable[{key, wav, label, sample_rate}]

        Returns:
            Iterable[{key, feat, label}]
    """
    for sample in data:
        sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
        if 'emotion_embedding' in sample:
            sample['emotion_embedding'] = torch.tensor(sample['emotion_embedding'], dtype=torch.float32)
        if normalize:
            sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
            sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
            if 'emotion_embedding' in sample:
                sample['emotion_embedding'] = F.normalize(sample['emotion_embedding'], dim=0)
        yield sample

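# F.normalize with dim=0 rescales a vector to unit L2 norm, which is what
# the normalize branch above does to each embedding; 192 is an arbitrary
# illustrative dimension.
def _example_normalize_embedding():
    emb = torch.randn(192)
    unit = F.normalize(emb, dim=0)
    assert torch.allclose(unit.norm(), torch.tensor(1.0))
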
def tokenize(data, get_tokenizer, allowed_special, mode='train'):
    """ Encode text into chars or BPE token ids
        Inplace operation

        Args:
            data: Iterable[{key, wav, txt, sample_rate}]

        Returns:
            Iterable[{key, wav, txt, tokens, label, sample_rate}]
    """
    tokenizer = get_tokenizer()
    for sample in data:
        assert 'text' in sample
        sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
        if mode == 'inference':
            sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
        yield sample

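# Sketch of the tokenizer contract assumed above: get_tokenizer() must
# return an object exposing a tiktoken-style encode(text, allowed_special=...)
# method. The whitespace tokenizer below is a hypothetical stand-in.
class _ToyTokenizer:
    def encode(self, text, allowed_special='all'):
        return [hash(word) % 1000 for word in text.split()]

def _example_tokenize():
    samples = [{'text': 'hello world'}]
    return list(tokenize(samples, get_tokenizer=_ToyTokenizer, allowed_special='all'))
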
def shuffle(data, shuffle_size=10000, mode='train'):
    """ Local shuffle the data

        Args:
            data: Iterable[{key, feat, label}]
            shuffle_size: buffer size for shuffle

        Returns:
            Iterable[{key, feat, label}]
    """
    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= shuffle_size:
            random.shuffle(buf)
            for x in buf:
                yield x
            buf = []
    # The samples left over
    random.shuffle(buf)
    for x in buf:
        yield x

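# Illustrative run of the buffered shuffle: with shuffle_size=4 the stream
# is randomized within a 4-element window rather than globally, which bounds
# memory on arbitrarily long streams.
def _example_shuffle():
    out = list(shuffle(range(10), shuffle_size=4))
    assert sorted(out) == list(range(10))  # same items, locally reordered
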
def sort(data, sort_size=500, mode='train'):
    """ Sort the data by feature length.
        Sort is used after shuffle and before batch, so we can group
        utts with similar lengths into a batch, and `sort_size` should
        be less than `shuffle_size`

        Args:
            data: Iterable[{key, feat, label}]
            sort_size: buffer size for sort

        Returns:
            Iterable[{key, feat, label}]
    """
    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= sort_size:
            buf.sort(key=lambda x: x['speech_feat'].size(0))
            for x in buf:
                yield x
            buf = []
    # The samples left over
    buf.sort(key=lambda x: x['speech_feat'].size(0))
    for x in buf:
        yield x

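# Illustrative run of the buffered sort: items inside each sort_size window
# come out ordered by speech_feat length, so downstream batches pad less.
def _example_sort():
    samples = [{'speech_feat': torch.zeros(n, 80)} for n in (9, 2, 7, 4)]
    out = list(sort(samples, sort_size=4))
    assert [s['speech_feat'].size(0) for s in out] == [2, 4, 7, 9]
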
def static_batch(data, batch_size=16):
    """ Static batch the data by `batch_size`

        Args:
            data: Iterable[{key, feat, label}]
            batch_size: batch size

        Returns:
            Iterable[List[{key, feat, label}]]
    """
    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= batch_size:
            yield buf
            buf = []
    if len(buf) > 0:
        yield buf

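# Illustrative run: static_batch groups a stream into fixed-size lists and
# emits the short remainder as a final batch.
def _example_static_batch():
    out = list(static_batch(range(10), batch_size=4))
    assert out == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
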
def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
    """ Dynamic batch the data until the total frames in batch
        reach `max_frames_in_batch`

        Args:
            data: Iterable[{key, feat, label}]
            max_frames_in_batch: max_frames in one batch

        Returns:
            Iterable[List[{key, feat, label}]]
    """
    buf = []
    longest_frames = 0
    for sample in data:
        assert 'speech_feat' in sample
        assert isinstance(sample['speech_feat'], torch.Tensor)
        new_sample_frames = sample['speech_feat'].size(0)
        longest_frames = max(longest_frames, new_sample_frames)
        frames_after_padding = longest_frames * (len(buf) + 1)
        if frames_after_padding > max_frames_in_batch:
            yield buf
            buf = [sample]
            longest_frames = new_sample_frames
        else:
            buf.append(sample)
    if len(buf) > 0:
        yield buf

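# Worked example of the budget check above, with made-up lengths: for frame
# counts 100, 180, 120 and max_frames_in_batch=400, adding the third sample
# would pad the batch to 3 * 180 = 540 frames, so the first two are flushed
# as one batch and the third starts a new one.
def _example_dynamic_batch():
    samples = [{'speech_feat': torch.zeros(n, 80)} for n in (100, 180, 120)]
    out = list(dynamic_batch(samples, max_frames_in_batch=400))
    assert [len(b) for b in out] == [2, 1]
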
def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
    """ Wrapper for static/dynamic batch
    """
    if mode == 'inference':
        return static_batch(data, 1)
    else:
        if batch_type == 'static':
            return static_batch(data, batch_size)
        elif batch_type == 'dynamic':
            return dynamic_batch(data, max_frames_in_batch)
        else:
            logging.fatal('Unsupported batch type {}'.format(batch_type))

def padding(data, use_spk_embedding, mode='train', gan=False):
    """ Padding the data into training data

        Args:
            data: Iterable[List[{key, feat, label}]]

        Returns:
            Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
    """
    for sample in data:
        assert isinstance(sample, list)
        # speech_feat is (T, num_mels); order the batch by time length, descending
        speech_feat_len = torch.tensor([x['speech_feat'].size(0) for x in sample],
                                       dtype=torch.int32)
        order = torch.argsort(speech_feat_len, descending=True)
        utts = [sample[i]['utt'] for i in order]
        speech = [sample[i]['speech'].squeeze(dim=0) for i in order]
        speech_len = torch.tensor([i.size(0) for i in speech], dtype=torch.int32)
        speech = pad_sequence(speech, batch_first=True, padding_value=0)
        speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
        speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
        speech_token = pad_sequence(speech_token,
                                    batch_first=True,
                                    padding_value=0)
        speech_feat = [sample[i]['speech_feat'] for i in order]
        speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
        speech_feat = pad_sequence(speech_feat,
                                   batch_first=True,
                                   padding_value=0)
        text = [sample[i]['text'] for i in order]
        text_token = [torch.tensor(sample[i]['text_token']) for i in order]
        text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
        text_token = pad_sequence(text_token, batch_first=True, padding_value=0)
        utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0)
        spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
        if 'emotion_embedding' in sample[0]:
            emotion_embedding = torch.stack([sample[i]['emotion_embedding'] for i in order], dim=0)
        batch = {
            "utts": utts,
            "speech": speech,
            "speech_len": speech_len,
            "speech_token": speech_token,
            "speech_token_len": speech_token_len,
            "speech_feat": speech_feat,
            "speech_feat_len": speech_feat_len,
            "text": text,
            "text_token": text_token,
            "text_token_len": text_token_len,
            "utt_embedding": utt_embedding,
            "spk_embedding": spk_embedding,
        }
        if 'emotion_embedding' in sample[0]:
            batch["emotion_embedding"] = emotion_embedding
        if gan is True:
            # in gan train, we need pitch_feat
            pitch_feat = [sample[i]['pitch_feat'] for i in order]
            pitch_feat_len = torch.tensor([i.size(0) for i in pitch_feat], dtype=torch.int32)
            pitch_feat = pad_sequence(pitch_feat,
                                      batch_first=True,
                                      padding_value=0)
            batch["pitch_feat"] = pitch_feat
            batch["pitch_feat_len"] = pitch_feat_len
        else:
            # only gan train needs speech, delete it to save memory
            del batch["speech"]
            del batch["speech_len"]
        if mode == 'inference':
            tts_text = [sample[i]['tts_text'] for i in order]
            tts_index = [sample[i]['tts_index'] for i in order]
            tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
            tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
            tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
            batch.update({'tts_text': tts_text,
                          'tts_index': tts_index,
                          'tts_text_token': tts_text_token,
                          'tts_text_token_len': tts_text_token_len})
        if use_spk_embedding is True:
            batch["embedding"] = batch["spk_embedding"]
        else:
            batch["embedding"] = batch["utt_embedding"]
        yield batch

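# Minimal sketch of the collation pattern used in padding(): pad_sequence
# right-pads variable-length tensors into one rectangular batch while the
# true lengths are kept in a separate tensor.
def _example_pad_sequence():
    tokens = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
    lengths = torch.tensor([t.size(0) for t in tokens], dtype=torch.int32)
    padded = pad_sequence(tokens, batch_first=True, padding_value=0)
    assert padded.shape == (2, 3) and lengths.tolist() == [3, 2]
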
cosyvoice_rodis/flow/__pycache__/decoder.cpython-310.pyc
ADDED
Binary file (5.19 kB)

cosyvoice_rodis/flow/__pycache__/decoder.cpython-38.pyc
ADDED
Binary file (5.27 kB)

cosyvoice_rodis/flow/__pycache__/decoder.cpython-39.pyc
ADDED
Binary file (5.23 kB)

cosyvoice_rodis/flow/__pycache__/flow.cpython-310.pyc
ADDED
Binary file (5.22 kB)

cosyvoice_rodis/flow/__pycache__/flow.cpython-38.pyc
ADDED
Binary file (5.2 kB)

cosyvoice_rodis/flow/__pycache__/flow.cpython-39.pyc
ADDED
Binary file (4.21 kB)

cosyvoice_rodis/flow/__pycache__/flow_matching.cpython-310.pyc
ADDED
Binary file (5.6 kB)

cosyvoice_rodis/flow/__pycache__/flow_matching.cpython-38.pyc
ADDED
Binary file (5.61 kB)

cosyvoice_rodis/flow/__pycache__/flow_matching.cpython-39.pyc
ADDED
Binary file (5.45 kB)

cosyvoice_rodis/flow/__pycache__/length_regulator.cpython-310.pyc
ADDED
Binary file (2.23 kB)