Spaces:
Runtime error
Runtime error
| # # Import required libraries | |
| import numpy as np | |
| from pydub import AudioSegment | |
| import vocode | |
| from vocode import getenv | |
| import gradio as gr | |
| import os | |
| import logging | |
| from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent | |
| from vocode.turn_based.synthesizer import CoquiSynthesizer | |
| from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber | |
| from vocode.turn_based.output_device.speaker_output import SpeakerOutput | |
# Persona prompt handed to ChatGPTAgent as its system message. The numbered
# lines are sample quotes the model is explicitly allowed to reuse, so typos
# here can surface verbatim in the generated (and then synthesized) reply.
system_prompt = """
You are the evil lord Darth Vader. You answer all questions your subjects ask you, but do so with immense contempt and loathing.
You may use any of the following lines or take inspiration from this conversational style where appropriate in your response.
Examples:
1|You have failed me for the last time Admiral.
2|You should not have come back.
3|I find your lack of faith disturbing.
4|Release your anger.
5|Commander, tear this ship apart until you've found those plans! And bring me the passengers, I want them alive!
6|The force is strong with this one.
7|If you only knew the power of the Dark Side.
8|It is pointless to resist, my son.
9|Give yourself to the dark side.
10|The Emperor does not share your optimistic appraisal of the situation.
11|Obi-Wan has taught you well.
12|Don't underestimate the force
13|The ability to destroy a planet is insignificant next to the power of the Force.
14|I find your lack of faith disturbing.
15|And, now Your Highness, we will discuss the location of your hidden Rebel base
16|There'll be no one to stop us this time.
17|I am your father.
18|If you only knew the power of the dark side.
19|He will join us or die, master.
20|The emperor is not as forgiving as I am.
21|Indeed you are powerful as the emperor has foreseen.
22|Perhaps you feel you are being treated unfairly?
23|The Force is with you young Skywalker, but you are not a jedi yet.
24|What is thy bidding my master?
25|The Emperor has been expecting you.
26|We would be honored if you would join us.
27|Leave them to me. I will deal with them myself.
28|Your powers are weak, old man.
29|If this is a consular ship, where is the ambassador? Commander, tear this ship apart until you've found those plans. And bring me the passengers - I want them alive!
30|I sense something. A presence I have not felt since...
31|Don't make me destroy you.
32|I've been waiting for you, Obi-Wan. We meet again at last. The circuit is now complete - When I left you, I was but the learner. Now, I am the master.
33|Escape is not his plan. I must face him...alone.
34|Don't get too proud of this technological terror you've constructed.
Answer the question accurately in less than 150 words. Remember you are Darth Vader.
"""
# ---------------------------------------------------------------------------
# 1. Vocode setup
# ---------------------------------------------------------------------------
# Both API keys are mandatory; abort at import time when either is missing
# or empty.
_openai_api_key = os.getenv("OPENAI_API_KEY")
_coqui_api_key = os.getenv("COQUI_API_KEY")
if not (_openai_api_key and _coqui_api_key):
    raise EnvironmentError("Required environment variables not set")

# Pass the credentials on to vocode. The voice id is forwarded as-is and is
# not covered by the check above, so it may be None.
vocode.setenv(
    OPENAI_API_KEY=_openai_api_key,
    COQUI_API_KEY=_coqui_api_key,
    COQUI_VOICE_ID=os.getenv("COQUI_VOICE_ID"),
)

# Module logger, set to DEBUG.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Sample rate (Hz) that send_audio() resamples synthesized speech to.
DEFAULT_SAMPLING_RATE = 44_100
def convert_to_audio_segment(input_audio):
    """Convert an audio payload into a pydub ``AudioSegment``.

    Args:
        input_audio: Either a ``(sample_rate, samples)`` pair, where
            ``samples`` is a NumPy array shaped ``(n,)`` for mono or
            ``(n, channels)`` for multi-channel audio, or a path to an audio
            file (the "Audio File" tab is configured with
            ``type="filepath"`` and therefore passes a string).

    Returns:
        An ``AudioSegment``. For array input the samples are cast to 16-bit
        integers; for path input the segment is whatever pydub decodes from
        the file.

    Raises:
        ValueError: If ``input_audio`` is ``None``.
    """
    if input_audio is None:
        raise ValueError("No input audio was provided")

    # A path cannot be unpacked into (rate, samples); let pydub decode it.
    if isinstance(input_audio, (str, os.PathLike)):
        return AudioSegment.from_file(input_audio)

    sample_rate, audio_data = input_audio
    audio_data = np.asarray(audio_data).astype(np.int16)  # 16-bit PCM

    # A 2-D array carries one column per channel; tobytes() emits it in
    # row-major order, i.e. interleaved frames, so the channel count passed
    # to pydub must match or the audio is interpreted as a longer mono clip.
    channels = audio_data.shape[1] if audio_data.ndim == 2 else 1

    return AudioSegment(
        audio_data.tobytes(),
        frame_rate=sample_rate,
        sample_width=audio_data.dtype.itemsize,  # 2 bytes for int16
        channels=channels,
    )
def send_audio(audio_segment: AudioSegment):
    """Return the samples of *audio_segment* as a 1-D ``int16`` NumPy array.

    The segment is first brought to the format the decoding step assumes:
    ``DEFAULT_SAMPLING_RATE`` Hz, 16-bit samples, a single channel. Without
    this, reading the raw bytes as ``int16`` would misinterpret audio with a
    different sample width or with interleaved channels.

    Args:
        audio_segment: The synthesized audio to convert.

    Returns:
        A one-dimensional ``numpy.ndarray`` of dtype ``int16`` sampled at
        ``DEFAULT_SAMPLING_RATE``.
    """
    logger.info("now processing output")

    if audio_segment.frame_rate != DEFAULT_SAMPLING_RATE:
        audio_segment = audio_segment.set_frame_rate(DEFAULT_SAMPLING_RATE)

    # np.frombuffer below reads 2-byte samples; convert anything else first.
    int16_width = np.dtype(np.int16).itemsize
    if audio_segment.sample_width != int16_width:
        audio_segment = audio_segment.set_sample_width(int16_width)

    # Downmix so the flat array is a genuine mono signal rather than
    # interleaved channel data.
    if audio_segment.channels != 1:
        audio_segment = audio_segment.set_channels(1)

    return np.frombuffer(audio_segment.raw_data, dtype=np.int16)
def main(input_audio):
    """Run one speech-to-speech turn: transcribe, ask the agent, synthesize.

    A new transcriber, agent and synthesizer are created on every call, so
    no conversation state is kept between calls.

    Args:
        input_audio: Audio payload from the Gradio input component; it is
            passed unchanged to ``convert_to_audio_segment``.

    Returns:
        A ``(DEFAULT_SAMPLING_RATE, samples)`` tuple, which is the form a
        ``gr.Audio(type="numpy")`` output expects, or ``None`` when no input
        was supplied or any stage of the pipeline failed.
    """
    logger.info("Type of input_audio: %s", type(input_audio))
    logger.info("input_audio: %s", input_audio)

    # Nothing was recorded or uploaded: skip the API clients entirely.
    if input_audio is None:
        logger.warning("No input audio received")
        return None

    transcriber = WhisperTranscriber(api_key=getenv("OPENAI_API_KEY"))
    # Initialize ChatGPTAgent
    agent = ChatGPTAgent(
        system_prompt=system_prompt,
        initial_message="What up",
        api_key=getenv("OPENAI_API_KEY"),
    )
    # Initialize CoquiSynthesizer
    synthesizer = CoquiSynthesizer(
        voice_id=getenv("COQUI_VOICE_ID"),
        api_key=getenv("COQUI_API_KEY"),
    )
    print("Starting conversation. Press Ctrl+C to exit.")

    # The original wrapped this in `while True`, but every path returned or
    # broke on the first pass, so a plain try/except is equivalent.
    try:
        input_audio_segment = convert_to_audio_segment(input_audio)
        logger.info("Input Audio Segment: %s", input_audio_segment)
        logger.info("Type of input_audio_segment: %s", type(input_audio_segment))

        transcript = transcriber.transcribe(input_audio_segment)
        logger.info("Transcription: %s", transcript)

        response = agent.respond(transcript)
        logger.info("Agent response: %s", response)

        output_audio = synthesizer.synthesize(response)
        logger.info("output audio: %s", output_audio)

        # send_audio() resamples to DEFAULT_SAMPLING_RATE, so pair the samples
        # with that rate; a bare array is not a valid numpy-audio output.
        return DEFAULT_SAMPLING_RATE, send_audio(output_audio)
    except Exception as e:
        # Top-level UI boundary: record the traceback and return no audio.
        logger.exception("Failed to synthesize response: %s", e)
        return None
# UI layout adapted from
# https://huggingface.co/spaces/course-demos/speech-to-speech-translation/blob/main/app.py
demo = gr.Blocks()

title = "Chatty Vader"
description = "Darth Vader resurrected with all the knowledge of humanity"

# Settings common to both tabs: same handler, same heading text.
_shared_interface_kwargs = dict(fn=main, title=title, description=description)

# Tab 1: record from the microphone.
mic_translate = gr.Interface(
    inputs=gr.Audio(source="microphone"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    **_shared_interface_kwargs,
)

# Tab 2: upload an audio file (delivered to the handler as a file path).
file_translate = gr.Interface(
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    **_shared_interface_kwargs,
)

_tabs = [mic_translate, file_translate]
_tab_names = ["Microphone", "Audio File"]
with demo:
    gr.TabbedInterface(_tabs, _tab_names)

demo.launch()