github-actions[bot] committed · Commit 4657892 · 1 parent: 623b3b9
Sync with https://github.com/mozilla-ai/document-to-podcast
app.py CHANGED

@@ -1,7 +1,6 @@
 """Streamlit app for converting documents to podcasts."""
 
 import io
-import os
 import re
 from pathlib import Path
 
@@ -23,16 +22,13 @@ from document_to_podcast.utils import stack_audio_segments
 @st.cache_resource
 def load_text_to_text_model():
     return load_llama_cpp_model(
-        model_id="bartowski/Qwen2.5-
+        model_id="bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
     )
 
 
 @st.cache_resource
-def load_text_to_speech_model():
-    if os.environ.get("HF_SPACE") == "TRUE":
-        return load_tts_model("hexgrad/kLegacy/v0.19/kokoro-v0_19.pth")
-    else:
-        return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
+def load_text_to_speech_model(lang_code: str):
+    return load_tts_model("hexgrad/Kokoro-82M", **{"lang_code": lang_code})
 
 
 def numpy_to_wav(audio_array: np.ndarray, sample_rate: int) -> io.BytesIO:
@@ -115,29 +111,11 @@ if "clean_text" in st.session_state:
         "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
     )
     st.divider()
+    tts_link = "- [hexgrad/Kokoro-82M](https://github.com/hexgrad/kokoro)"
+
+    SPEAKERS = DEFAULT_SPEAKERS
 
     text_model = load_text_to_text_model()
-    speech_model = load_text_to_speech_model()
-
-    if os.environ.get("HF_SPACE") == "TRUE":
-        tts_link = "- [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)"
-        SPEAKERS = [
-            {
-                "id": 1,
-                "name": "Sarah",
-                "description": "The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.",
-                "voice_profile": "af_sarah",
-            },
-            {
-                "id": 2,
-                "name": "Michael",
-                "description": "The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.",
-                "voice_profile": "am_michael",
-            },
-        ]
-    else:
-        tts_link = "- [OuteAI/OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF)"
-        SPEAKERS = DEFAULT_SPEAKERS
 
     st.markdown(
         "For this demo, we are using the following models: \n"
@@ -180,6 +158,15 @@ if "clean_text" in st.session_state:
             speaker.get(x, None) for x in ["name", "description", "voice_profile"]
         )
     )
+    if speakers[0]["voice_profile"][0] != speakers[1]["voice_profile"][0]:
+        raise ValueError(
+            "Both Kokoro speakers need to have the same language code. "
+            "More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md"
+        )
+    # Get which language is used for generation from the first character of the Kokoro voice profile
+    language_code = speakers[0]["voice_profile"][0]
+    speech_model = load_text_to_speech_model(lang_code=language_code)
+
     system_prompt = DEFAULT_PROMPT.replace("{SPEAKERS}", speakers_str)
    with st.spinner("Generating Podcast..."):
         text = ""
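
For reference, the language-code handling added above can be exercised on its own. The sketch below mirrors the logic in the diff (the first character of a Kokoro voice profile is its language code, and both speakers must share it), but outside Streamlit. The helper name resolve_kokoro_lang_code and the trimmed speaker entries are illustrative only; in the app the check runs inline and the real DEFAULT_SPEAKERS and load_tts_model come from the document_to_podcast package.

# Minimal sketch, not part of the app: replicates the voice-profile check added above.
# `resolve_kokoro_lang_code` is a hypothetical helper; speaker entries are trimmed
# to the fields the check needs.
SPEAKERS = [
    {"id": 1, "name": "Sarah", "voice_profile": "af_sarah"},
    {"id": 2, "name": "Michael", "voice_profile": "am_michael"},
]


def resolve_kokoro_lang_code(speakers: list) -> str:
    """Return the shared Kokoro language code (first character of each voice profile)."""
    codes = {speaker["voice_profile"][0] for speaker in speakers}
    if len(codes) != 1:
        raise ValueError(
            "Both Kokoro speakers need to have the same language code. "
            "More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md"
        )
    return codes.pop()


lang_code = resolve_kokoro_lang_code(SPEAKERS)
print(lang_code)  # -> "a" for af_sarah / am_michael
# In the app this value is passed to the cached loader, which forwards it to Kokoro:
#     speech_model = load_text_to_speech_model(lang_code=lang_code)
#     # -> load_tts_model("hexgrad/Kokoro-82M", **{"lang_code": lang_code})

A side effect of making lang_code an argument of the @st.cache_resource-decorated loader is that Streamlit keeps one cached TTS model per language code, so switching the speakers to profiles in another language loads a matching pipeline instead of reusing the previous one.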