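"""Gradio demo for two CosyVoice-300M TTS variants.

Tab TTS-v1 runs the base model with zero-shot voice cloning from an uploaded
reference clip (or a built-in talk-show speaker prompt); tab TTS-v3 runs the
"speaker-minus" variant, which additionally conditions on a precomputed
emotion embedding (Angry / Happy / Surprise / Sad).
"""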
import gradio as gr
import sys, os
import torch
from cosyvoice.utils.file_utils import load_wav
from tts_model.base_model.cosyvoice import CosyVoice as CosyVoiceTTS_base
from tts_model.speaker_minus.cosyvoice import CosyVoice as CosyVoiceTTS_speakerminus
# from tts_model.model_cosy2_instruct import CosyVoiceTTS as CosyVoiceTTS_cosy2
from pydub import AudioSegment
import tempfile
import soundfile as sf
import subprocess
import numpy as np
import random
# AudioSegment.converter = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffmpeg"
# AudioSegment.ffprobe = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffprobe"
# Put the directory holding the static ffmpeg/ffprobe binaries on PATH so pydub can find them.
ffmpeg_path = os.path.expanduser("/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/")
os.environ["PATH"] += os.pathsep + ffmpeg_path
sys.path.append('third_party/Matcha-TTS')
os.environ["PYTHONPATH"] = 'third_party/Matcha-TTS'  # os.system("export ...") would only affect a throwaway subshell
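# Load both model variants once at startup so every request reuses the warm models.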
tts_base = CosyVoiceTTS_base(model_dir="./pretrained_models/CosyVoice-300M/")
tts_speakerminus = CosyVoiceTTS_speakerminus(model_dir="./pretrained_models/CosyVoice-300M-speakerminus/")
# tts_cosy2_instruct = CosyVoiceTTS_cosy2(model_path="./pretrained_models/CosyVoice-300M-Instruct_cosy2/")
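# Built-in zero-shot prompts: text_prompt maps each speaker to the transcript of
# their reference clip (kept in Chinese because it must match the prompt audio),
# and audio_prompt maps the speaker to the wav filename under audio_prompt_path.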
text_prompt = {
    "翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
    "范志毅": "没这个能力知道吗,我已经说了,你像这样的比赛本身就没有打好基础。",
    "呼兰": "发完之后那个工作人员说,老师,呼兰老师你还要再加个标签儿,我说加什么标签儿,他说你就加一个呼兰太好笑了。",
    "江梓浩": "就是很多我们这帮演员一整年也就上这么一个脱口秀类型的节目。",
    "李雪琴": "我就劝他,我说你呀,你没事儿也放松放松,你那身体都亮红灯儿了你还不注意。",
    "刘旸": "比如这三年我在街上开车,会在开车的时候进行一些哲思,我有天开车的时候路过一个地方。",
    "唐香玉": "大家好我叫唐香玉, 我年前把我的工作辞了,成了一个全职脱口秀演员。",
    "小鹿": "然后我就老家的亲戚太多了,我也记不清谁该叫谁,所以我妈带着我和我。",
    "于祥宇": "我大学专业学的是哲学,然后节目组就说那这期主题你可以刚好聊一下哲学专业毕业之后的就业方向。",
    "赵晓卉": "终于没有人问我为什么不辞职了,结果谈到现在,谈恋爱第一天人家问我,能打个电话吗?我说你有啥事儿。",
    "徐志胜": "最舒服的一个方式,这个舞台也不一定就是说是来第一年就好嘛,只要你坚持,肯定会有发光发热的那天嘛。"
}
audio_prompt = {
    "翟佳宁": "zhaijianing",
    "范志毅": "fanzhiyi",
    "呼兰": "hulan",
    "江梓浩": "jiangzhihao",
    "李雪琴": "lixueqin",
    "刘旸": "liuchang",
    "唐香玉": "tangxiangyu",
    "小鹿": "xiaolu",
    "于祥宇": "yuxiangyu",
    "赵晓卉": "zhaoxiaohui",
    "徐志胜": "xuzhisheng"
}
audio_prompt_path = "/mnt/by079416/fengping/CosyVoice2/talk_show_prompt/"
def load_audio_and_convert_to_16bit(file_path, target_sample_rate=16000):
    """Load any audio file, downmix to mono at target_sample_rate, and return int16 samples."""
    audio = AudioSegment.from_file(file_path)
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != target_sample_rate:
        audio = audio.set_frame_rate(target_sample_rate)
    audio_data = np.array(audio.get_array_of_samples(), dtype=np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:  # avoid a divide-by-zero on silent input
        audio_data = audio_data / peak
    audio_data = (audio_data * 32767).astype(np.int16)
    return torch.tensor(audio_data), target_sample_rate
def convert_audio_with_sox(input_file, output_file, target_sample_rate=16000):
    """Resample input_file to mono 16-bit PCM at target_sample_rate via the sox CLI."""
    try:
        command = [
            'sox', input_file,
            '-r', str(target_sample_rate),
            '-b', '16',
            '-c', '1',
            output_file
        ]
        subprocess.run(command, check=True)
        print(f"Audio converted successfully: {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
def generate_speech_base(tts_text, speed, speaker, emotion, ref_audio, ref_text):
    """Zero-shot synthesis with the base model; with no uploaded reference, fall back to the built-in prompt for `speaker`."""
    if not ref_audio and not ref_text:
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
    else:
        # Normalize the uploaded reference to mono 16 kHz 16-bit PCM before loading.
        random_int = random.randint(0, 90)
        soxed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxed_ref_audio)
        ref_audio = soxed_ref_audio
    ref_audio = load_wav(ref_audio, 16000)
    # ref_audio, target_sample_rate = load_audio_and_convert_to_16bit(ref_audio)
    sample_rate, full_audio = tts_base.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        prompt_speech_16k=ref_audio,
        speed=speed,
        # speaker=speaker,
        # emotion=emotion,
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1
        )
        audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")
    return output_audio_path
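# Example call (hypothetical text; passing None/"" falls back to the built-in 徐志胜 prompt):
#   wav_path = generate_speech_base("大家好,欢迎收听。", 1.0, "徐志胜", "peace", None, "")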
def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
    """Synthesize with the speaker-minus model, steering emotion via the precomputed embedding selected by `key`."""
    if not ref_audio and not ref_text:
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
    else:
        # Normalize the uploaded reference to mono 16 kHz 16-bit PCM before loading.
        random_int = random.randint(0, 90)
        soxed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxed_ref_audio)
        ref_audio = soxed_ref_audio
    # ref_audio, target_sample_rate = load_audio_and_convert_to_16bit(ref_audio)
    ref_audio = load_wav(ref_audio, 16000)
    # if key == "Surprise":
    #     emotion_info = torch.load("/mnt/by079416/surprise.pt")
    # if key == "Sad":
    #     emotion_info = torch.load("/mnt/by079416/sad.pt")
    # if key == "Angry":
    #     emotion_info = torch.load("/mnt/by079416/angry.pt")
    # if key == "Happy":
    #     emotion_info = torch.load("/mnt/by079416/happy.pt")
    # Per-emotion embeddings precomputed for reference speaker "0002".
    emotion_info = torch.load("/mnt/by079416/fengping/CosyVoice2/embedding_info.pt")["0002"][key]
    sample_rate, full_audio = tts_speakerminus.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        # speaker=speaker,
        prompt_speech_16k=ref_audio,
        key=key,
        emotion_speakerminus=emotion_info,
        # ref_audio=ref_audio,
        speed=speed
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1
        )
        audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")
    return output_audio_path
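# Example call (hypothetical text; assumes "Happy" is a key under "0002" in embedding_info.pt):
#   wav_path = generate_speech_speakerminus("大家好,欢迎收听。", 1.0, "徐志胜", "Happy", None, "")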
names = [
    "于祥宇",
    "刘旸",
    "呼兰",
    "唐香玉",
    "小鹿",
    "李雪琴",
    "江梓浩",
    "翟佳宁",
    "范志毅",
    "赵晓卉",
    "徐志胜"
]
with gr.Blocks() as demo:
    gr.Markdown("Base model and instruct model demos")
    # base
    with gr.Tab("TTS-v1"):
        gr.Markdown("## Base model testing")
        tts_base_inputs = [
            gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
            gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed", info="Adjust speech rate (0.5x to 2.0x)"),
            gr.Radio(choices=names, value="徐志胜", label="Speaker", info="Choose the voice to clone"),
            gr.Radio(choices=["peace"], value="peace", label="Emotion", info="Choose the emotion style"),
            gr.Audio(type="filepath", label="Reference audio"),
            gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="Transcript of the reference audio")
        ]
        tts_base_output = gr.Audio(type="filepath", label="Generated speech")
        tts_base_button = gr.Button("Generate speech")
        tts_base_button.click(
            fn=generate_speech_base,
            inputs=tts_base_inputs,
            outputs=tts_base_output
        )
# with gr.Tab("TTS-v2"):
# gr.Markdown("## base model testing ##")
# tts_base_inputs = [
# gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
# gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="speed", info="Adjust speech rate (0.5x to 2.0x)"),
# gr.Radio(choices=names, value="徐志胜", label="说话人", info="选择语音说话人"),
# gr.Radio(choices=["peace", "excited", "mixed"], value="peace", label="情感", info="选择情感风格"),
# gr.Textbox(lines=2, placeholder="Enter your instruct text here...", label="Input Text"),
# gr.Audio(type="filepath", label="输入音频"),
# gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="音频对应的文本")
# ]
# tts_base_output = gr.Audio(type="filepath", label="生成的语音")
# tts_base_button = gr.Button("生成语音")
# tts_base_button.click(
# fn=generate_speech_cosy2_instruct,
# inputs=tts_base_inputs,
# outputs=tts_base_output
# )
# # instruct
# def generate_speech_speakerminus(tts_text, speed, speaker, emotion_speakerminus, key, ref_audio, ref_text):
with gr.Tab("TTS-v3"):
gr.Markdown("## instruct model testing")
tts2_inputs = [
gr.Textbox(lines=2, placeholder="输入文本...", label="输入文本"),
gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="语速", info="调整语速 (0.5x 到 2.0x)"),
gr.Radio(choices=names, value="徐志胜", label="可选说话人", info="选择语音说话人"),
gr.Radio(choices=["Angry", "Happy", "Surprise", "Sad"], value="peace", label="情感", info="选择情感风格"),
gr.Audio(type="filepath", label="输入音频"),
gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="音频对应的文本")
]
tts2_output = gr.Audio(type="filepath", label="生成的语音")
tts2_button = gr.Button("生成语音")
tts2_button.click(
fn=generate_speech_speakerminus,
inputs=tts2_inputs,
outputs=tts2_output
)
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=10132,
        share=False
    )
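# A minimal way to try the app locally (assuming this file is saved as app.py and
# the model dirs / prompt wavs above exist):
#   python app.py   ->   open http://localhost:10132 in a browser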