Spaces:
Running
on
Zero
Running
on
Zero
add simple version
Browse files
app.py
CHANGED
|
@@ -538,6 +538,154 @@ def generate_audio(
|
|
| 538 |
)
|
| 539 |
|
| 540 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
# UI Helper Functions
|
| 542 |
|
| 543 |
def load_speaker_metadata(speaker_id):
|
|
@@ -1709,512 +1857,718 @@ def init_and_compile():
|
|
| 1709 |
# On Zero GPU, don't try to compile
|
| 1710 |
return session_id, gr.update(), gr.update()
|
| 1711 |
|
| 1712 |
-
|
| 1713 |
-
|
| 1714 |
-
|
| 1715 |
-
|
| 1716 |
-
|
| 1717 |
-
|
| 1718 |
-
|
| 1719 |
-
|
| 1720 |
-
|
| 1721 |
-
|
| 1722 |
-
|
| 1723 |
-
|
| 1724 |
-
|
| 1725 |
-
|
| 1726 |
-
|
| 1727 |
-
|
| 1728 |
-
1. **Pick or upload a voice** - Choose from the voicebank or upload your own audio (up to 2 minutes)
|
| 1729 |
-
2. **Choose a text prompt preset or enter your own prompt** - What you want the voice to say (the presets are a good guide for format/style)
|
| 1730 |
-
3. **Select a Sampling preset (optional) ** - The default preset "Independent (High Speaker CFG)" is usually good to start
|
| 1731 |
-
4. **Click Generate Audio** - Wait for the model to generate your audio
|
| 1732 |
-
|
| 1733 |
-
<div class="tip-box">
|
| 1734 |
-
|
| 1735 |
-
💡 **Tip:** If the generated voice doesn't match the reference speaker at all, enable "Speaker KV Attention Scaling" and click Generate Audio again.
|
| 1736 |
-
|
| 1737 |
-
</div>
|
| 1738 |
-
|
| 1739 |
-
### Advanced Mode
|
| 1740 |
-
|
| 1741 |
-
Switch to Advanced mode for full control over all generation parameters including CFG scales, sampling steps, truncation, and more.
|
| 1742 |
-
|
| 1743 |
-
### Other tips
|
| 1744 |
-
|
| 1745 |
-
High CFG settings are recommended but may lead to oversaturation; APG might help with this. Flat settings tend to reduce "impulse" artifacts but might result in worse (blunted/compressed/artifact-y) laughter, breathing, etc. generation.
|
| 1746 |
|
| 1747 |
-
|
| 1748 |
-
|
|
|
|
| 1749 |
|
| 1750 |
-
# Session state for per-user file management
|
| 1751 |
session_id_state = gr.State(None)
|
| 1752 |
|
| 1753 |
-
#
|
| 1754 |
-
|
| 1755 |
-
|
| 1756 |
-
|
| 1757 |
-
|
| 1758 |
-
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
|
| 1762 |
-
|
| 1763 |
-
|
| 1764 |
-
|
| 1765 |
-
|
| 1766 |
-
|
| 1767 |
-
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
|
| 1772 |
-
|
| 1773 |
-
|
| 1774 |
-
|
| 1775 |
-
|
| 1776 |
-
|
| 1777 |
-
|
| 1778 |
-
|
| 1779 |
-
|
| 1780 |
-
|
| 1781 |
-
|
| 1782 |
-
|
| 1783 |
-
|
| 1784 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1785 |
interactive=False,
|
| 1786 |
-
|
|
|
|
| 1787 |
)
|
|
|
|
|
|
|
|
|
|
| 1788 |
|
| 1789 |
-
|
| 1790 |
-
|
| 1791 |
-
|
| 1792 |
-
|
| 1793 |
-
|
| 1794 |
-
|
| 1795 |
-
|
| 1796 |
-
|
| 1797 |
-
|
| 1798 |
-
|
| 1799 |
-
|
| 1800 |
-
|
| 1801 |
-
|
| 1802 |
-
|
| 1803 |
-
|
| 1804 |
-
|
| 1805 |
-
|
| 1806 |
-
|
| 1807 |
-
|
| 1808 |
-
|
| 1809 |
-
|
| 1810 |
-
|
| 1811 |
-
|
| 1812 |
-
|
| 1813 |
-
|
| 1814 |
-
|
| 1815 |
-
|
| 1816 |
-
|
| 1817 |
-
|
| 1818 |
-
|
| 1819 |
-
|
| 1820 |
-
|
| 1821 |
-
|
| 1822 |
-
|
| 1823 |
-
|
| 1824 |
-
interactive=False,
|
| 1825 |
-
label="Click any cell to select",
|
| 1826 |
-
column_widths=["10%", "8%", "15%", "30%", "37%"]
|
| 1827 |
-
)
|
| 1828 |
-
|
| 1829 |
-
# VCTK UI (hidden by default)
|
| 1830 |
-
with gr.Column(scale=2, visible=False) as vctk_column:
|
| 1831 |
-
gr.Markdown("### 1. Speakers (VCTK)")
|
| 1832 |
-
vctk_speaker_display = gr.Textbox(
|
| 1833 |
-
value="",
|
| 1834 |
-
label="",
|
| 1835 |
-
show_label=False,
|
| 1836 |
-
interactive=False,
|
| 1837 |
-
visible=False,
|
| 1838 |
-
lines=2,
|
| 1839 |
-
max_lines=2
|
| 1840 |
-
)
|
| 1841 |
-
vctk_speaker_search = gr.Textbox(
|
| 1842 |
-
placeholder="Search speakers (by ID, gender, age, details)...",
|
| 1843 |
-
label="",
|
| 1844 |
-
show_label=False,
|
| 1845 |
-
container=False
|
| 1846 |
-
)
|
| 1847 |
-
vctk_speakers_table = gr.Dataframe(
|
| 1848 |
-
value=get_vctk_speakers_table(),
|
| 1849 |
-
headers=["ID", "G", "Age", "Details", "Length"],
|
| 1850 |
-
datatype=["str", "str", "str", "str", "str"],
|
| 1851 |
-
row_count=(8, "dynamic"),
|
| 1852 |
-
col_count=(5, "fixed"),
|
| 1853 |
-
interactive=False,
|
| 1854 |
-
label="Click any cell to select",
|
| 1855 |
-
column_widths=["10%", "8%", "12%", "50%", "20%"]
|
| 1856 |
-
)
|
| 1857 |
-
|
| 1858 |
-
# Expresso UI (hidden by default)
|
| 1859 |
-
with gr.Column(scale=2, visible=False) as expresso_column:
|
| 1860 |
-
gr.Markdown("### 1. Voices (Expresso)")
|
| 1861 |
-
expresso_speaker_display = gr.Textbox(
|
| 1862 |
-
value="",
|
| 1863 |
-
label="",
|
| 1864 |
-
show_label=False,
|
| 1865 |
-
interactive=False,
|
| 1866 |
-
visible=False,
|
| 1867 |
-
lines=2,
|
| 1868 |
-
max_lines=2
|
| 1869 |
-
)
|
| 1870 |
-
expresso_speaker_search = gr.Textbox(
|
| 1871 |
-
placeholder="Search voices (by ID, type, speakers, style)...",
|
| 1872 |
-
label="",
|
| 1873 |
-
show_label=False,
|
| 1874 |
-
container=False
|
| 1875 |
-
)
|
| 1876 |
-
expresso_speakers_table = gr.Dataframe(
|
| 1877 |
-
value=get_expresso_speakers_table(),
|
| 1878 |
-
headers=["ID", "Type", "Speakers", "Style", "Length"],
|
| 1879 |
-
datatype=["str", "str", "str", "str", "str"],
|
| 1880 |
-
row_count=(8, "dynamic"),
|
| 1881 |
-
col_count=(5, "fixed"),
|
| 1882 |
-
interactive=False,
|
| 1883 |
-
label="Click any cell to select",
|
| 1884 |
-
column_widths=["35%", "15%", "15%", "15%", "20%"]
|
| 1885 |
-
)
|
| 1886 |
-
|
| 1887 |
-
# HF-Custom UI (hidden by default)
|
| 1888 |
-
with gr.Column(scale=2, visible=False) as hf_custom_column:
|
| 1889 |
-
gr.Markdown("### 1. Voices (HF-Custom)")
|
| 1890 |
-
hf_custom_speaker_display = gr.Textbox(
|
| 1891 |
-
value="",
|
| 1892 |
-
label="",
|
| 1893 |
-
show_label=False,
|
| 1894 |
-
interactive=False,
|
| 1895 |
-
visible=False,
|
| 1896 |
-
lines=2,
|
| 1897 |
-
max_lines=2
|
| 1898 |
-
)
|
| 1899 |
-
hf_custom_speaker_search = gr.Textbox(
|
| 1900 |
-
placeholder="Search voices (by name, dataset, description)...",
|
| 1901 |
-
label="",
|
| 1902 |
-
show_label=False,
|
| 1903 |
-
container=False
|
| 1904 |
-
)
|
| 1905 |
-
hf_custom_speakers_table = gr.Dataframe(
|
| 1906 |
-
value=get_hf_custom_speakers_table(),
|
| 1907 |
-
headers=["Name", "Dataset", "Description", "Length"],
|
| 1908 |
-
datatype=["str", "str", "str", "str"],
|
| 1909 |
-
row_count=(8, "dynamic"),
|
| 1910 |
-
col_count=(4, "fixed"),
|
| 1911 |
-
interactive=False,
|
| 1912 |
-
label="Click any cell to select",
|
| 1913 |
-
column_widths=["15%", "15%", "50%", "20%"]
|
| 1914 |
-
)
|
| 1915 |
-
|
| 1916 |
-
with gr.Column(scale=1, visible=True) as voice_type_column:
|
| 1917 |
-
gr.Markdown("### 2. Voice Type")
|
| 1918 |
-
selected_voice_display = gr.Textbox(
|
| 1919 |
-
value="",
|
| 1920 |
-
label="",
|
| 1921 |
-
show_label=False,
|
| 1922 |
-
interactive=False,
|
| 1923 |
-
visible=False,
|
| 1924 |
-
lines=2,
|
| 1925 |
-
max_lines=2
|
| 1926 |
-
)
|
| 1927 |
-
freeform_table = gr.Dataframe(
|
| 1928 |
-
value=[],
|
| 1929 |
-
headers=["Type", "Length"],
|
| 1930 |
-
datatype=["str", "str"],
|
| 1931 |
-
row_count=(1, "fixed"),
|
| 1932 |
-
col_count=(2, "fixed"),
|
| 1933 |
-
interactive=False,
|
| 1934 |
-
label="Freeform voice",
|
| 1935 |
-
visible=True,
|
| 1936 |
-
column_widths=["60%", "40%"]
|
| 1937 |
-
)
|
| 1938 |
-
gr.Markdown("**Emotions:**")
|
| 1939 |
-
emotions_table = gr.Dataframe(
|
| 1940 |
-
value=[],
|
| 1941 |
-
headers=["Emotion", "Length"],
|
| 1942 |
-
datatype=["str", "str"],
|
| 1943 |
-
row_count=(8, "dynamic"),
|
| 1944 |
-
col_count=(2, "fixed"),
|
| 1945 |
-
interactive=False,
|
| 1946 |
-
visible=True,
|
| 1947 |
-
column_widths=["60%", "40%"]
|
| 1948 |
-
)
|
| 1949 |
-
|
| 1950 |
-
with gr.Column(scale=1):
|
| 1951 |
-
gr.Markdown("### 3. Audio Preview")
|
| 1952 |
-
audio_preview = gr.Audio(label="Voice Sample", type="filepath", interactive=False)
|
| 1953 |
-
|
| 1954 |
-
gr.HTML('<hr class="section-separator">')
|
| 1955 |
-
gr.Markdown("# Text Prompt")
|
| 1956 |
-
with gr.Accordion("Text Presets", open=True):
|
| 1957 |
-
text_presets_table = gr.Dataframe(
|
| 1958 |
-
value=load_text_presets(),
|
| 1959 |
-
headers=["Category", "Words", "Preset Text"],
|
| 1960 |
-
datatype=["str", "str", "str"],
|
| 1961 |
-
row_count=(3, "dynamic"),
|
| 1962 |
-
col_count=(3, "fixed"),
|
| 1963 |
-
interactive=False,
|
| 1964 |
-
column_widths=["12%", "6%", "82%"]
|
| 1965 |
-
)
|
| 1966 |
-
text_prompt = gr.Textbox(
|
| 1967 |
-
label="Text Prompt",
|
| 1968 |
-
placeholder="[S1] Enter your text prompt here...",
|
| 1969 |
-
lines=4
|
| 1970 |
-
)
|
| 1971 |
-
|
| 1972 |
-
gr.HTML('<hr class="section-separator">')
|
| 1973 |
-
gr.Markdown("# Generation")
|
| 1974 |
-
|
| 1975 |
-
# Mode selector: Simple or Advanced (outside the accordion, centered and prominent)
|
| 1976 |
-
with gr.Row():
|
| 1977 |
-
with gr.Column(scale=1):
|
| 1978 |
-
pass # Empty column for spacing
|
| 1979 |
-
with gr.Column(scale=2):
|
| 1980 |
-
mode_selector = gr.Radio(
|
| 1981 |
-
choices=["Simple Mode", "Advanced Mode"],
|
| 1982 |
-
value="Simple Mode",
|
| 1983 |
-
label="",
|
| 1984 |
-
info=None,
|
| 1985 |
-
elem_id="component-mode-selector"
|
| 1986 |
-
)
|
| 1987 |
-
with gr.Column(scale=1):
|
| 1988 |
-
pass # Empty column for spacing
|
| 1989 |
-
|
| 1990 |
-
with gr.Accordion("⚙️ Generation Parameters", open=True):
|
| 1991 |
-
|
| 1992 |
-
with gr.Row():
|
| 1993 |
-
presets = load_sampler_presets()
|
| 1994 |
-
preset_keys = list(presets.keys())
|
| 1995 |
-
first_preset = preset_keys[0] if preset_keys else "Custom"
|
| 1996 |
|
| 1997 |
-
|
| 1998 |
-
|
| 1999 |
-
|
| 2000 |
-
|
| 2001 |
-
|
| 2002 |
-
|
| 2003 |
)
|
| 2004 |
|
| 2005 |
-
|
| 2006 |
-
|
| 2007 |
-
|
| 2008 |
-
info="Random seed for starting noise",
|
| 2009 |
-
precision=0,
|
| 2010 |
-
scale=1
|
| 2011 |
)
|
| 2012 |
|
| 2013 |
-
#
|
| 2014 |
-
with gr.
|
| 2015 |
-
|
| 2016 |
-
|
| 2017 |
-
|
| 2018 |
-
|
| 2019 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2020 |
|
| 2021 |
-
|
| 2022 |
-
|
| 2023 |
-
|
| 2024 |
-
|
| 2025 |
-
|
| 2026 |
-
|
| 2027 |
-
|
| 2028 |
-
|
| 2029 |
-
|
| 2030 |
-
|
| 2031 |
-
|
| 2032 |
-
|
| 2033 |
-
|
| 2034 |
-
|
| 2035 |
-
|
| 2036 |
-
|
| 2037 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2038 |
|
| 2039 |
-
|
| 2040 |
-
|
| 2041 |
-
|
| 2042 |
-
|
| 2043 |
-
|
| 2044 |
-
|
| 2045 |
-
|
| 2046 |
-
|
| 2047 |
-
|
| 2048 |
-
|
| 2049 |
-
|
| 2050 |
-
|
| 2051 |
-
|
| 2052 |
-
|
| 2053 |
-
|
| 2054 |
-
|
| 2055 |
-
|
| 2056 |
-
|
| 2057 |
-
|
| 2058 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2059 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2060 |
|
|
|
|
|
|
|
| 2061 |
|
|
|
|
| 2062 |
with gr.Row():
|
| 2063 |
-
# Left column: Core Sampling Parameters
|
| 2064 |
with gr.Column(scale=1):
|
| 2065 |
-
|
| 2066 |
-
|
| 2067 |
-
|
| 2068 |
-
|
| 2069 |
-
|
| 2070 |
-
|
| 2071 |
-
|
| 2072 |
-
""
|
| 2073 |
-
|
| 2074 |
-
|
| 2075 |
-
|
| 2076 |
-
|
| 2077 |
-
|
| 2078 |
-
|
| 2079 |
-
|
| 2080 |
-
|
| 2081 |
-
|
| 2082 |
-
|
| 2083 |
-
|
| 2084 |
-
|
| 2085 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2086 |
)
|
| 2087 |
|
| 2088 |
-
|
| 2089 |
-
|
| 2090 |
-
|
| 2091 |
-
|
| 2092 |
-
|
| 2093 |
-
|
| 2094 |
-
|
| 2095 |
-
|
| 2096 |
-
|
| 2097 |
-
|
| 2098 |
-
|
| 2099 |
-
|
| 2100 |
-
|
| 2101 |
-
|
| 2102 |
-
|
| 2103 |
-
|
| 2104 |
-
|
| 2105 |
-
cfg_scale_speaker = gr.Number(label="Speaker CFG Scale", value=5.0, info="Guidance strength for speaker", minimum=0, step=0.5)
|
| 2106 |
-
|
| 2107 |
-
with gr.Row():
|
| 2108 |
-
cfg_min_t = gr.Number(label="CFG Min t", value=0.5, info="(0-1), CFG applied when t >= val", minimum=0, maximum=1, step=0.05)
|
| 2109 |
-
cfg_max_t = gr.Number(label="CFG Max t", value=1.0, info="(0-1), CFG applied when t <= val", minimum=0, maximum=1, step=0.05)
|
| 2110 |
|
| 2111 |
-
#
|
| 2112 |
-
with gr.Column(
|
| 2113 |
-
with gr.
|
| 2114 |
-
gr.
|
| 2115 |
-
|
| 2116 |
-
|
| 2117 |
-
|
| 2118 |
-
|
| 2119 |
-
|
| 2120 |
-
|
| 2121 |
-
|
| 2122 |
-
|
| 2123 |
-
|
| 2124 |
-
|
| 2125 |
-
|
| 2126 |
-
|
| 2127 |
-
|
| 2128 |
-
|
| 2129 |
-
|
| 2130 |
-
|
|
|
|
| 2131 |
|
| 2132 |
-
with gr.Group():
|
| 2133 |
-
gr.HTML("""
|
| 2134 |
-
<div class="preset-inline">
|
| 2135 |
-
<span class="title">Truncation & Temporal Rescaling</span><span class="dim">(</span>
|
| 2136 |
-
<a href="javascript:void(0)" class="preset-link" data-fire="trunc_flat">flat</a>
|
| 2137 |
-
<span class="dim">,</span>
|
| 2138 |
-
<a href="javascript:void(0)" class="preset-link" data-fire="trunc_sharp">sharp</a>
|
| 2139 |
-
<span class="dim">,</span>
|
| 2140 |
-
<a href="javascript:void(0)" class="preset-link" data-fire="trunc_baseline">baseline(sharp)</a>
|
| 2141 |
-
<span class="dim">)</span>
|
| 2142 |
-
</div>
|
| 2143 |
-
""")
|
| 2144 |
-
trunc_preset_flat = gr.Button("", elem_id="trunc_flat", elem_classes=["proxy-btn"])
|
| 2145 |
-
trunc_preset_sharp = gr.Button("", elem_id="trunc_sharp", elem_classes=["proxy-btn"])
|
| 2146 |
-
trunc_preset_baseline = gr.Button("", elem_id="trunc_baseline", elem_classes=["proxy-btn"])
|
| 2147 |
-
with gr.Row():
|
| 2148 |
-
truncation_factor = gr.Number(label="Truncation Factor", value=0.8, info="Multiply initial noise (<1 helps artifacts)", minimum=0, step=0.05)
|
| 2149 |
-
rescale_k = gr.Number(label="Rescale k", value=1.2, info="<1=sharpen, >1=flatten, 1=off", minimum=0, step=0.05)
|
| 2150 |
-
rescale_sigma = gr.Number(label="Rescale σ", value=3.0, info="Sigma parameter", minimum=0, step=0.1)
|
| 2151 |
|
| 2152 |
-
with gr.
|
| 2153 |
-
|
| 2154 |
-
|
| 2155 |
-
|
| 2156 |
-
|
| 2157 |
-
|
| 2158 |
-
|
| 2159 |
-
|
| 2160 |
-
|
| 2161 |
-
|
| 2162 |
-
|
| 2163 |
-
|
| 2164 |
-
|
| 2165 |
-
|
| 2166 |
-
|
| 2167 |
-
|
| 2168 |
-
|
| 2169 |
-
|
| 2170 |
-
|
| 2171 |
-
|
| 2172 |
-
|
| 2173 |
-
|
| 2174 |
-
|
| 2175 |
-
|
| 2176 |
-
|
| 2177 |
-
|
| 2178 |
-
|
| 2179 |
-
|
| 2180 |
-
|
| 2181 |
-
|
| 2182 |
-
|
| 2183 |
-
|
| 2184 |
-
|
| 2185 |
-
|
| 2186 |
-
|
| 2187 |
-
|
| 2188 |
-
|
| 2189 |
-
|
| 2190 |
-
|
| 2191 |
-
|
| 2192 |
-
|
| 2193 |
-
|
| 2194 |
-
|
| 2195 |
-
|
| 2196 |
-
|
| 2197 |
-
|
| 2198 |
-
|
| 2199 |
-
|
| 2200 |
-
|
| 2201 |
-
|
| 2202 |
-
|
| 2203 |
-
|
| 2204 |
-
|
| 2205 |
-
|
| 2206 |
-
|
| 2207 |
-
|
| 2208 |
-
|
| 2209 |
-
|
| 2210 |
-
|
| 2211 |
-
|
| 2212 |
-
|
| 2213 |
-
|
| 2214 |
-
|
| 2215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2216 |
|
| 2217 |
# Event handlers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2218 |
# Custom Audio Panel - handle audio change to update speaker_audio_path_state
|
| 2219 |
custom_audio_input.change(
|
| 2220 |
lambda audio: gr.update(value=audio if audio else ""),
|
|
|
|
| 538 |
)
|
| 539 |
|
| 540 |
|
| 541 |
+
@spaces.GPU
def generate_audio_simple(
    text_prompt: str,
    speaker_audio_path: str,
    preset_name: str,
    rng_seed: int,
    num_steps: int,
    speaker_kv_enable: bool,
    speaker_kv_scale: float,
    session_id: str,
) -> Tuple[Any, Any]:
    """Simplified audio generation with preset-based parameters for the Simple View.

    Sampling parameters are read from the sampler preset named ``preset_name``;
    any value the preset does not provide (or that cannot be parsed as a float)
    falls back to a built-in default. Guidance mode is always
    ``GuidanceMode.INDEPENDENT`` and all APG options are disabled.

    Args:
        text_prompt: Text passed to the sampling pipeline.
        speaker_audio_path: Path to the reference speaker audio. An empty
            string or ``None`` means no speaker audio is supplied.
        preset_name: Key into the mapping returned by ``load_sampler_presets()``.
            An unknown name yields an empty preset, i.e. all defaults.
        rng_seed: Random seed; ``None`` is treated as ``0``.
        num_steps: Number of sampling steps, clamped to the range 1-80.
        speaker_kv_enable: Whether to enable Speaker KV scaling.
        speaker_kv_scale: Speaker KV scale; a falsy value (``None`` or ``0``)
            is replaced by ``1.5`` when scaling is enabled.
        session_id: Session identifier used for temp-file cleanup and for the
            output file stem.

    Returns:
        A pair of ``gr.update`` objects: the first carries the path of the
        saved WAV file, the second a "generated in N seconds" message. Both
        are marked visible.
    """
    # Load models on first use (required for Zero GPU)
    load_models()

    # Use compiled model if available, otherwise uncompiled
    global model, model_compiled
    active_model = model_compiled if model_compiled is not None else model

    # Cleanup old temp files
    cleanup_temp_audio(TEMP_AUDIO_DIR, session_id)

    # Normalise "no speaker" (None or empty string) to None
    if not speaker_audio_path:
        speaker_audio_path = None

    start_time = time.time()

    # Load preset values
    presets = load_sampler_presets()
    preset = presets.get(preset_name, {})

    # Helper to convert string values to float
    def to_float(val, default):
        try:
            return float(val) if val is not None else default
        except (ValueError, TypeError):
            return default

    # Apply preset values (or use defaults)
    # NOTE(review): unlike rng_seed, num_steps has no None guard, so
    # int(None) raises TypeError here — confirm the UI never sends None.
    num_steps_int = min(max(int(num_steps), 1), 80)
    rng_seed_int = int(rng_seed) if rng_seed is not None else 0
    cfg_scale_text_val = to_float(preset.get("cfg_scale_text"), 3.0)
    cfg_scale_speaker_val = to_float(preset.get("cfg_scale_speaker"), 8.0)
    cfg_min_t_val = to_float(preset.get("cfg_min_t"), 0.5)
    cfg_max_t_val = to_float(preset.get("cfg_max_t"), 1.0)
    truncation_factor_val = to_float(preset.get("truncation_factor"), 1.0)
    rescale_k_raw = to_float(preset.get("rescale_k"), 1.0)
    rescale_k_val = rescale_k_raw if rescale_k_raw != 1.0 else None  # 1.0 means off
    rescale_sigma_val = to_float(preset.get("rescale_sigma"), 3.0)
    guidance_mode = GuidanceMode.INDEPENDENT  # Simple view always uses independent

    # Speaker KV parameters (user override takes precedence)
    if speaker_kv_enable:
        speaker_k_scale_val = float(speaker_kv_scale) if speaker_kv_scale else 1.5
        speaker_k_min_t_val = 0.9
        speaker_k_max_layers_val = 24
    else:
        speaker_k_scale_val = None
        speaker_k_min_t_val = None
        speaker_k_max_layers_val = None

    # Default shapes
    pad_to_max_text_seq_len = 768
    pad_to_max_speaker_latent_len = 2560
    sample_latent_len_val = 640

    # Create sample function with parameters
    sample_fn = partial(
        sample_euler_cfg_any,
        num_steps=num_steps_int,
        guidance_mode=guidance_mode,
        cfg_scale_text=cfg_scale_text_val,
        cfg_scale_speaker=cfg_scale_speaker_val,
        cfg_min_t=cfg_min_t_val,
        cfg_max_t=cfg_max_t_val,
        truncation_factor=truncation_factor_val,
        rescale_k=rescale_k_val,
        rescale_sigma=rescale_sigma_val,
        speaker_k_scale=speaker_k_scale_val,
        speaker_k_min_t=speaker_k_min_t_val,
        speaker_k_max_layers=speaker_k_max_layers_val,
        apg_eta_text=None,
        apg_eta_speaker=None,
        apg_momentum_text=None,
        apg_momentum_speaker=None,
        apg_norm_text=None,
        apg_norm_speaker=None,
        block_size=sample_latent_len_val
    )

    # Load speaker audio if provided
    if speaker_audio_path is not None:
        speaker_audio = load_audio(speaker_audio_path).cuda()
    else:
        speaker_audio = None

    # Generate audio
    audio_out = sample_pipeline(
        model=active_model,
        fish_ae=fish_ae,
        pca_state=pca_state,
        sample_fn=sample_fn,
        text_prompt=text_prompt,
        speaker_audio=speaker_audio,
        rng_seed=rng_seed_int,
        pad_to_max_text_seq_len=pad_to_max_text_seq_len,
        pad_to_max_speaker_latent_len=pad_to_max_speaker_latent_len,
    )

    # Apply silentcipher watermarking if enabled
    audio_to_save = audio_out[0].cpu()
    if USE_SILENTCIPHER and silentcipher_model is not None:
        # Best effort: a watermarking failure must not lose the generated
        # audio, so any error is reported and the unwatermarked audio is saved.
        try:
            audio_numpy = audio_to_save.squeeze(0).numpy()
            encoded_audio, sdr = silentcipher_model.encode_wav(
                audio_numpy,
                44100,
                SILENTCIPHER_MESSAGE,
                message_sdr=SILENTCIPHER_SDR
            )
            audio_to_save = torch.tensor(encoded_audio).unsqueeze(0)
        except Exception as e:
            print(f"Warning: Watermarking failed: {e}")
            print("Saving audio without watermark...")

    # Save generated audio as WAV (unique filename per session)
    stem = make_stem("generated_simple", session_id)
    output_path = save_audio_with_format(
        audio_to_save,
        TEMP_AUDIO_DIR,
        stem,
        44100,
        "wav"
    )

    # Calculate generation time
    generation_time = time.time() - start_time
    time_str = f"⏱️ Generated in {generation_time:.1f}s"

    return (
        gr.update(value=str(output_path), visible=True),
        gr.update(value=time_str, visible=True)
    )
|
| 687 |
+
|
| 688 |
+
|
| 689 |
# UI Helper Functions
|
| 690 |
|
| 691 |
def load_speaker_metadata(speaker_id):
|
|
|
|
| 1857 |
# On Zero GPU, don't try to compile
|
| 1858 |
return session_id, gr.update(), gr.update()
|
| 1859 |
|
| 1860 |
+
# Extra stylesheet rules for the `.simple-*` classes (container width, generate
# button sizing, output-area layout). Concatenated after LINK_CSS and handed to
# gr.Blocks(css=...) when the UI is built below.
# NOTE(review): no elem_classes using these selectors are visible in this part
# of the file — confirm the Simple tab components actually attach them.
SIMPLE_CSS = """
|
| 1861 |
+
.simple-container {
|
| 1862 |
+
max-width: 1200px;
|
| 1863 |
+
margin: 0 auto;
|
| 1864 |
+
}
|
| 1865 |
+
.simple-generate-btn {
|
| 1866 |
+
font-size: 1.2rem !important;
|
| 1867 |
+
padding: 1rem 2rem !important;
|
| 1868 |
+
}
|
| 1869 |
+
.simple-output-container {
|
| 1870 |
+
min-height: 200px;
|
| 1871 |
+
display: flex;
|
| 1872 |
+
flex-direction: column;
|
| 1873 |
+
justify-content: center;
|
| 1874 |
+
}
|
| 1875 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1876 |
|
| 1877 |
+
with gr.Blocks(title="Echo-TTS", css=LINK_CSS + SIMPLE_CSS, js=JS_CODE) as demo:
|
| 1878 |
+
gr.Markdown("# Echo-TTS")
|
| 1879 |
+
gr.Markdown("*Jordan Darefsky, 2025. See technical details [here](https://jordandarefsky.com/blog/2025/echo/). All audio outputs are subject to non-commercial use [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).*")
|
| 1880 |
|
| 1881 |
+
# Session state for per-user file management (shared between tabs)
|
| 1882 |
session_id_state = gr.State(None)
|
| 1883 |
|
| 1884 |
+
# ==================== TABS ====================
|
| 1885 |
+
with gr.Tabs() as main_tabs:
|
| 1886 |
+
# ==================== SIMPLE VIEW TAB ====================
|
| 1887 |
+
with gr.TabItem("🎯 Simple", id="simple_tab"):
|
| 1888 |
+
gr.Markdown("Upload a voice reference (or select a voice from the library), enter text (or select a text preset), and generate!")
|
| 1889 |
+
gr.Markdown("Generate up to 30 seconds of audio. *If the generated voice does not match the reference speaker, enable Speaker KV in the Generation Parameters section.*")
|
| 1890 |
+
|
| 1891 |
+
with gr.Row():
|
| 1892 |
+
# LEFT: Inputs
|
| 1893 |
+
with gr.Column(scale=1):
|
| 1894 |
+
# Voice section - dropdown above audio
|
| 1895 |
+
with gr.Group():
|
| 1896 |
+
simple_audio_preset = gr.Dropdown(
|
| 1897 |
+
choices=["(upload your own or select from dropdown)"] + [f[0] for f in get_audio_prompt_files()],
|
| 1898 |
+
value="(upload your own or select from dropdown)",
|
| 1899 |
+
label="Voice",
|
| 1900 |
+
container=False
|
| 1901 |
+
)
|
| 1902 |
+
simple_audio_input = gr.Audio(
|
| 1903 |
+
sources=["upload", "microphone"],
|
| 1904 |
+
type="filepath",
|
| 1905 |
+
label=None,
|
| 1906 |
+
max_length=600
|
| 1907 |
+
)
|
| 1908 |
+
|
| 1909 |
+
gr.Markdown("---")
|
| 1910 |
+
|
| 1911 |
+
# Text input
|
| 1912 |
+
simple_text_prompt = gr.Textbox(
|
| 1913 |
+
label="Text",
|
| 1914 |
+
info="Enter the text you want the voice to say... or select a text preset below.",
|
| 1915 |
+
value="[S1] One of the cool things about Echo is that it can generate speech that sounds, I don't know, more human maybe? Like, uh, it can actually generate pretty natural disfluencies, well, at least some of the time. Like if you run it for a few different random seeds, a few different settings, there's uh, there's a decent chance that one of them will actually be pretty good. I mean, it's not perfect, obviously,",
|
| 1916 |
+
lines=4
|
| 1917 |
+
)
|
| 1918 |
+
|
| 1919 |
+
# RIGHT: Generate + Output
|
| 1920 |
+
with gr.Column(scale=1):
|
| 1921 |
+
# Generation parameters accordion
|
| 1922 |
+
with gr.Accordion("⚙️ Generation Parameters (optional)", open=True):
|
| 1923 |
+
# Only show independent mode presets in simple view
|
| 1924 |
+
simple_presets = {k: v for k, v in load_sampler_presets().items() if v.get("cfg_mode") == "independent"}
|
| 1925 |
+
with gr.Row():
|
| 1926 |
+
simple_preset = gr.Dropdown(
|
| 1927 |
+
choices=list(simple_presets.keys()),
|
| 1928 |
+
value=list(simple_presets.keys())[0] if simple_presets else None,
|
| 1929 |
+
label="Preset",
|
| 1930 |
+
scale=3,
|
| 1931 |
+
interactive=True
|
| 1932 |
+
)
|
| 1933 |
+
simple_rng_seed = gr.Number(
|
| 1934 |
+
label="Seed",
|
| 1935 |
+
value=0,
|
| 1936 |
+
precision=0,
|
| 1937 |
+
scale=1,
|
| 1938 |
+
min_width=60
|
| 1939 |
+
)
|
| 1940 |
+
simple_num_steps = gr.Number(
|
| 1941 |
+
label="Steps",
|
| 1942 |
+
value=40,
|
| 1943 |
+
precision=0,
|
| 1944 |
+
minimum=5,
|
| 1945 |
+
maximum=80,
|
| 1946 |
+
step=5,
|
| 1947 |
+
scale=1,
|
| 1948 |
+
min_width=60
|
| 1949 |
+
)
|
| 1950 |
+
with gr.Row():
|
| 1951 |
+
simple_speaker_kv_enable = gr.Checkbox(
|
| 1952 |
+
label="Enable Speaker KV",
|
| 1953 |
+
info="Check this if the generated voice does NOT match the reference speaker",
|
| 1954 |
+
value=False,
|
| 1955 |
+
scale=1
|
| 1956 |
+
)
|
| 1957 |
+
simple_speaker_kv_scale = gr.Number(
|
| 1958 |
+
label="KV Scale",
|
| 1959 |
+
info="(Try 1.5, 1.3, ..., 1.1)",
|
| 1960 |
+
value=1.5,
|
| 1961 |
+
step=0.1,
|
| 1962 |
+
visible=False,
|
| 1963 |
+
scale=1
|
| 1964 |
+
)
|
| 1965 |
+
|
| 1966 |
+
simple_generate_btn = gr.Button(
|
| 1967 |
+
"🎙️ Generate Audio",
|
| 1968 |
+
variant="primary",
|
| 1969 |
+
size="lg"
|
| 1970 |
+
)
|
| 1971 |
+
simple_time_display = gr.Markdown("", visible=False)
|
| 1972 |
+
simple_generated_audio = gr.Audio(
|
| 1973 |
+
label="Generated Audio",
|
| 1974 |
+
visible=True,
|
| 1975 |
+
interactive=False
|
| 1976 |
+
)
|
| 1977 |
+
|
| 1978 |
+
# Text presets - full width below
|
| 1979 |
+
with gr.Accordion("📝 Text Presets", open=False):
|
| 1980 |
+
simple_text_presets_table = gr.Dataframe(
|
| 1981 |
+
value=load_text_presets(),
|
| 1982 |
+
headers=["Category", "Words", "Text"],
|
| 1983 |
+
datatype=["str", "str", "str"],
|
| 1984 |
+
row_count=(4, "fixed"),
|
| 1985 |
+
col_count=(3, "fixed"),
|
| 1986 |
interactive=False,
|
| 1987 |
+
column_widths=["10%", "6%", "84%"],
|
| 1988 |
+
wrap=True
|
| 1989 |
)
|
| 1990 |
+
|
| 1991 |
+
gr.Markdown("---")
|
| 1992 |
+
gr.Markdown("*💡 For more control over generation parameters, switch to the **Advanced** tab.*")
|
| 1993 |
|
| 1994 |
+
# ==================== ADVANCED VIEW TAB ====================
|
| 1995 |
+
with gr.TabItem("⚙️ Advanced", id="advanced_tab"):
|
| 1996 |
+
# Instructions for Simple Mode
|
| 1997 |
+
with gr.Accordion("📖 Quick Start Instructions", open=False):
|
| 1998 |
+
gr.Markdown("""
|
| 1999 |
+
### Simple Mode (Recommended for Beginners)
|
| 2000 |
+
|
| 2001 |
+
1. **Pick or upload a voice** - Choose from the voicebank or upload your own audio (up to 2 minutes)
|
| 2002 |
+
2. **Choose a text prompt preset or enter your own prompt** - What you want the voice to say (the presets are a good guide for format/style)
|
| 2003 |
+
3. **Select a Sampling preset (optional) ** - The default preset "Independent (High Speaker CFG)" is usually good to start
|
| 2004 |
+
4. **Click Generate Audio** - Wait for the model to generate your audio
|
| 2005 |
+
|
| 2006 |
+
<div class="tip-box">
|
| 2007 |
+
|
| 2008 |
+
💡 **Tip:** If the generated voice doesn't match the reference speaker at all, enable "Speaker KV Attention Scaling" and click Generate Audio again.
|
| 2009 |
+
|
| 2010 |
+
</div>
|
| 2011 |
+
|
| 2012 |
+
### Advanced Mode
|
| 2013 |
+
|
| 2014 |
+
Switch to Advanced mode for full control over all generation parameters including CFG scales, sampling steps, truncation, and more.
|
| 2015 |
+
|
| 2016 |
+
### Other tips
|
| 2017 |
+
|
| 2018 |
+
High CFG settings are recommended but may lead to oversaturation; APG might help with this. Flat settings tend to reduce "impulse" artifacts but might result in worse (blunted/compressed/artifact-y) laughter, breathing, etc. generation.
|
| 2019 |
+
|
| 2020 |
+
Echo will try to fit the entire text-prompt into (<=) 30 seconds of audio. If your prompt is very long, the generated speech may be too quick (this is not an issue for shorter text-prompts). For disfluent, single-speaker speech, we recommend trying the reference text beginning with "[S1] ... explore how we can design" as a starting point.
|
| 2021 |
+
""")
|
| 2022 |
+
|
| 2023 |
+
# Hidden state variables to store paths and selection
|
| 2024 |
+
selected_speaker_state = gr.Textbox(visible=False, value="")
|
| 2025 |
+
speaker_st_path_state = gr.Textbox(visible=False, value="")
|
| 2026 |
+
speaker_audio_path_state = gr.Textbox(visible=False, value="")
|
| 2027 |
+
|
| 2028 |
+
gr.Markdown("# Voice Selection")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2029 |
|
| 2030 |
+
# Dataset selector
|
| 2031 |
+
dataset_selector = gr.Radio(
|
| 2032 |
+
choices=["Custom Audio Panel", "EARS", "VCTK", "Expresso", "HF-Custom"],
|
| 2033 |
+
value="Custom Audio Panel",
|
| 2034 |
+
label="Select Dataset",
|
| 2035 |
+
info="Choose which voicebank to use"
|
| 2036 |
)
|
| 2037 |
|
| 2038 |
+
dataset_license_info = gr.Markdown(
|
| 2039 |
+
"",
|
| 2040 |
+
visible=False
|
|
|
|
|
|
|
|
|
|
| 2041 |
)
|
| 2042 |
|
| 2043 |
+
# Custom Audio Panel UI (visible by default, takes full width)
|
| 2044 |
+
with gr.Row(visible=True) as custom_audio_row:
|
| 2045 |
+
# Optional: Audio prompt library table (only shown if AUDIO_PROMPT_FOLDER is configured)
|
| 2046 |
+
if AUDIO_PROMPT_FOLDER is not None and AUDIO_PROMPT_FOLDER.exists():
|
| 2047 |
+
with gr.Column(scale=1, min_width=200):
|
| 2048 |
+
gr.Markdown("#### Audio Library (favorite examples from voicebank datasets)")
|
| 2049 |
+
audio_prompt_table = gr.Dataframe(
|
| 2050 |
+
value=get_audio_prompt_files(),
|
| 2051 |
+
headers=["Filename"],
|
| 2052 |
+
datatype=["str"],
|
| 2053 |
+
row_count=(10, "dynamic"),
|
| 2054 |
+
col_count=(1, "fixed"),
|
| 2055 |
+
interactive=False,
|
| 2056 |
+
label="Click to select (or upload your own audio file directly on the right)"
|
| 2057 |
+
)
|
| 2058 |
+
|
| 2059 |
+
with gr.Column(scale=2):
|
| 2060 |
+
custom_audio_input = gr.Audio(
|
| 2061 |
+
sources=["upload", "microphone"],
|
| 2062 |
+
type="filepath",
|
| 2063 |
+
label="Speaker Reference Audio (only first two minutes will be used; leave empty for zero speaker conditioning)",
|
| 2064 |
+
max_length=600 # Maximum duration in seconds (10 minutes)
|
| 2065 |
+
)
|
| 2066 |
|
| 2067 |
+
with gr.Row(visible=False) as voicebank_row:
|
| 2068 |
+
# Voice selection UI for all voicebank datasets
|
| 2069 |
+
|
| 2070 |
+
# EARS UI (visible by default when voicebank_row is shown)
|
| 2071 |
+
with gr.Column(scale=2, visible=True) as ears_column:
|
| 2072 |
+
gr.Markdown("### 1. Speakers (EARS)")
|
| 2073 |
+
selected_speaker_display = gr.Textbox(
|
| 2074 |
+
value="",
|
| 2075 |
+
label="",
|
| 2076 |
+
show_label=False,
|
| 2077 |
+
interactive=False,
|
| 2078 |
+
visible=False,
|
| 2079 |
+
lines=2,
|
| 2080 |
+
max_lines=2
|
| 2081 |
+
)
|
| 2082 |
+
speaker_search = gr.Textbox(
|
| 2083 |
+
placeholder="Search speakers (by ID, gender, age, ethnicity, language)...",
|
| 2084 |
+
label="",
|
| 2085 |
+
show_label=False,
|
| 2086 |
+
container=False
|
| 2087 |
+
)
|
| 2088 |
+
speakers_table = gr.Dataframe(
|
| 2089 |
+
value=get_speakers_table(),
|
| 2090 |
+
headers=["ID", "G", "Age", "Ethnicity", "Native Lang"],
|
| 2091 |
+
datatype=["str", "str", "str", "str", "str"],
|
| 2092 |
+
row_count=(8, "dynamic"),
|
| 2093 |
+
col_count=(5, "fixed"),
|
| 2094 |
+
interactive=False,
|
| 2095 |
+
label="Click any cell to select",
|
| 2096 |
+
column_widths=["10%", "8%", "15%", "30%", "37%"]
|
| 2097 |
+
)
|
| 2098 |
|
| 2099 |
+
# VCTK UI (hidden by default)
|
| 2100 |
+
with gr.Column(scale=2, visible=False) as vctk_column:
|
| 2101 |
+
gr.Markdown("### 1. Speakers (VCTK)")
|
| 2102 |
+
vctk_speaker_display = gr.Textbox(
|
| 2103 |
+
value="",
|
| 2104 |
+
label="",
|
| 2105 |
+
show_label=False,
|
| 2106 |
+
interactive=False,
|
| 2107 |
+
visible=False,
|
| 2108 |
+
lines=2,
|
| 2109 |
+
max_lines=2
|
| 2110 |
+
)
|
| 2111 |
+
vctk_speaker_search = gr.Textbox(
|
| 2112 |
+
placeholder="Search speakers (by ID, gender, age, details)...",
|
| 2113 |
+
label="",
|
| 2114 |
+
show_label=False,
|
| 2115 |
+
container=False
|
| 2116 |
+
)
|
| 2117 |
+
vctk_speakers_table = gr.Dataframe(
|
| 2118 |
+
value=get_vctk_speakers_table(),
|
| 2119 |
+
headers=["ID", "G", "Age", "Details", "Length"],
|
| 2120 |
+
datatype=["str", "str", "str", "str", "str"],
|
| 2121 |
+
row_count=(8, "dynamic"),
|
| 2122 |
+
col_count=(5, "fixed"),
|
| 2123 |
+
interactive=False,
|
| 2124 |
+
label="Click any cell to select",
|
| 2125 |
+
column_widths=["10%", "8%", "12%", "50%", "20%"]
|
| 2126 |
+
)
|
| 2127 |
+
|
| 2128 |
+
# Expresso UI (hidden by default)
|
| 2129 |
+
with gr.Column(scale=2, visible=False) as expresso_column:
|
| 2130 |
+
gr.Markdown("### 1. Voices (Expresso)")
|
| 2131 |
+
expresso_speaker_display = gr.Textbox(
|
| 2132 |
+
value="",
|
| 2133 |
+
label="",
|
| 2134 |
+
show_label=False,
|
| 2135 |
+
interactive=False,
|
| 2136 |
+
visible=False,
|
| 2137 |
+
lines=2,
|
| 2138 |
+
max_lines=2
|
| 2139 |
+
)
|
| 2140 |
+
expresso_speaker_search = gr.Textbox(
|
| 2141 |
+
placeholder="Search voices (by ID, type, speakers, style)...",
|
| 2142 |
+
label="",
|
| 2143 |
+
show_label=False,
|
| 2144 |
+
container=False
|
| 2145 |
+
)
|
| 2146 |
+
expresso_speakers_table = gr.Dataframe(
|
| 2147 |
+
value=get_expresso_speakers_table(),
|
| 2148 |
+
headers=["ID", "Type", "Speakers", "Style", "Length"],
|
| 2149 |
+
datatype=["str", "str", "str", "str", "str"],
|
| 2150 |
+
row_count=(8, "dynamic"),
|
| 2151 |
+
col_count=(5, "fixed"),
|
| 2152 |
+
interactive=False,
|
| 2153 |
+
label="Click any cell to select",
|
| 2154 |
+
column_widths=["35%", "15%", "15%", "15%", "20%"]
|
| 2155 |
+
)
|
| 2156 |
+
|
| 2157 |
+
# HF-Custom UI (hidden by default)
|
| 2158 |
+
with gr.Column(scale=2, visible=False) as hf_custom_column:
|
| 2159 |
+
gr.Markdown("### 1. Voices (HF-Custom)")
|
| 2160 |
+
hf_custom_speaker_display = gr.Textbox(
|
| 2161 |
+
value="",
|
| 2162 |
+
label="",
|
| 2163 |
+
show_label=False,
|
| 2164 |
+
interactive=False,
|
| 2165 |
+
visible=False,
|
| 2166 |
+
lines=2,
|
| 2167 |
+
max_lines=2
|
| 2168 |
+
)
|
| 2169 |
+
hf_custom_speaker_search = gr.Textbox(
|
| 2170 |
+
placeholder="Search voices (by name, dataset, description)...",
|
| 2171 |
+
label="",
|
| 2172 |
+
show_label=False,
|
| 2173 |
+
container=False
|
| 2174 |
+
)
|
| 2175 |
+
hf_custom_speakers_table = gr.Dataframe(
|
| 2176 |
+
value=get_hf_custom_speakers_table(),
|
| 2177 |
+
headers=["Name", "Dataset", "Description", "Length"],
|
| 2178 |
+
datatype=["str", "str", "str", "str"],
|
| 2179 |
+
row_count=(8, "dynamic"),
|
| 2180 |
+
col_count=(4, "fixed"),
|
| 2181 |
+
interactive=False,
|
| 2182 |
+
label="Click any cell to select",
|
| 2183 |
+
column_widths=["15%", "15%", "50%", "20%"]
|
| 2184 |
+
)
|
| 2185 |
+
|
| 2186 |
+
with gr.Column(scale=1, visible=True) as voice_type_column:
|
| 2187 |
+
gr.Markdown("### 2. Voice Type")
|
| 2188 |
+
selected_voice_display = gr.Textbox(
|
| 2189 |
+
value="",
|
| 2190 |
+
label="",
|
| 2191 |
+
show_label=False,
|
| 2192 |
+
interactive=False,
|
| 2193 |
+
visible=False,
|
| 2194 |
+
lines=2,
|
| 2195 |
+
max_lines=2
|
| 2196 |
+
)
|
| 2197 |
+
freeform_table = gr.Dataframe(
|
| 2198 |
+
value=[],
|
| 2199 |
+
headers=["Type", "Length"],
|
| 2200 |
+
datatype=["str", "str"],
|
| 2201 |
+
row_count=(1, "fixed"),
|
| 2202 |
+
col_count=(2, "fixed"),
|
| 2203 |
+
interactive=False,
|
| 2204 |
+
label="Freeform voice",
|
| 2205 |
+
visible=True,
|
| 2206 |
+
column_widths=["60%", "40%"]
|
| 2207 |
+
)
|
| 2208 |
+
gr.Markdown("**Emotions:**")
|
| 2209 |
+
emotions_table = gr.Dataframe(
|
| 2210 |
+
value=[],
|
| 2211 |
+
headers=["Emotion", "Length"],
|
| 2212 |
+
datatype=["str", "str"],
|
| 2213 |
+
row_count=(8, "dynamic"),
|
| 2214 |
+
col_count=(2, "fixed"),
|
| 2215 |
+
interactive=False,
|
| 2216 |
+
visible=True,
|
| 2217 |
+
column_widths=["60%", "40%"]
|
| 2218 |
+
)
|
| 2219 |
+
|
| 2220 |
+
with gr.Column(scale=1):
|
| 2221 |
+
gr.Markdown("### 3. Audio Preview")
|
| 2222 |
+
audio_preview = gr.Audio(label="Voice Sample", type="filepath", interactive=False)
|
| 2223 |
+
|
| 2224 |
+
gr.HTML('<hr class="section-separator">')
|
| 2225 |
+
gr.Markdown("# Text Prompt")
|
| 2226 |
+
with gr.Accordion("Text Presets", open=True):
|
| 2227 |
+
text_presets_table = gr.Dataframe(
|
| 2228 |
+
value=load_text_presets(),
|
| 2229 |
+
headers=["Category", "Words", "Preset Text"],
|
| 2230 |
+
datatype=["str", "str", "str"],
|
| 2231 |
+
row_count=(3, "dynamic"),
|
| 2232 |
+
col_count=(3, "fixed"),
|
| 2233 |
+
interactive=False,
|
| 2234 |
+
column_widths=["12%", "6%", "82%"]
|
| 2235 |
)
|
| 2236 |
+
text_prompt = gr.Textbox(
|
| 2237 |
+
label="Text Prompt",
|
| 2238 |
+
placeholder="[S1] Enter your text prompt here...",
|
| 2239 |
+
lines=4
|
| 2240 |
+
)
|
| 2241 |
|
| 2242 |
+
gr.HTML('<hr class="section-separator">')
|
| 2243 |
+
gr.Markdown("# Generation")
|
| 2244 |
|
| 2245 |
+
# Mode selector: Simple or Advanced (outside the accordion, centered and prominent)
|
| 2246 |
with gr.Row():
|
|
|
|
| 2247 |
with gr.Column(scale=1):
|
| 2248 |
+
pass # Empty column for spacing
|
| 2249 |
+
with gr.Column(scale=2):
|
| 2250 |
+
mode_selector = gr.Radio(
|
| 2251 |
+
choices=["Simple Mode", "Advanced Mode"],
|
| 2252 |
+
value="Simple Mode",
|
| 2253 |
+
label="",
|
| 2254 |
+
info=None,
|
| 2255 |
+
elem_id="component-mode-selector"
|
| 2256 |
+
)
|
| 2257 |
+
with gr.Column(scale=1):
|
| 2258 |
+
pass # Empty column for spacing
|
| 2259 |
+
|
| 2260 |
+
with gr.Accordion("⚙️ Generation Parameters", open=True):
|
| 2261 |
+
|
| 2262 |
+
with gr.Row():
|
| 2263 |
+
presets = load_sampler_presets()
|
| 2264 |
+
preset_keys = list(presets.keys())
|
| 2265 |
+
first_preset = preset_keys[0] if preset_keys else "Custom"
|
| 2266 |
+
|
| 2267 |
+
preset_dropdown = gr.Dropdown(
|
| 2268 |
+
choices=["Custom"] + preset_keys,
|
| 2269 |
+
value=first_preset, # Default to first preset instead of Custom
|
| 2270 |
+
label="Sampler Preset",
|
| 2271 |
+
info="Load preset configurations",
|
| 2272 |
+
scale=2
|
| 2273 |
+
)
|
| 2274 |
+
|
| 2275 |
+
rng_seed = gr.Number(
|
| 2276 |
+
label="RNG Seed",
|
| 2277 |
+
value=0,
|
| 2278 |
+
info="Random seed for starting noise",
|
| 2279 |
+
precision=0,
|
| 2280 |
+
scale=1
|
| 2281 |
+
)
|
| 2282 |
+
|
| 2283 |
+
# Simple mode: Speaker KV checkbox on same row (visible by default)
|
| 2284 |
+
with gr.Column(scale=1, visible=True) as simple_mode_row:
|
| 2285 |
+
speaker_kv_simple_checkbox = gr.Checkbox(
|
| 2286 |
+
label="\"Force Speaker\" (Enable Speaker KV Attention Scaling)",
|
| 2287 |
+
value=False,
|
| 2288 |
+
info="Enable if generation does not match reference voice (otherwise leave off)"
|
| 2289 |
)
|
| 2290 |
|
| 2291 |
+
# Advanced mode: Compile and custom shapes checkboxes (hidden by default)
|
| 2292 |
+
with gr.Column(scale=1, visible=False) as advanced_mode_compile_column:
|
| 2293 |
+
compile_checkbox = gr.Checkbox(
|
| 2294 |
+
label="Compile Model",
|
| 2295 |
+
value=True, # Default to True in simple mode
|
| 2296 |
+
interactive=not IS_ZEROGPU,
|
| 2297 |
+
info="Compile disabled on Zero GPU" if IS_ZEROGPU else "~20-30% faster after initial compilation"
|
| 2298 |
+
)
|
| 2299 |
+
compile_status = gr.Markdown(
|
| 2300 |
+
value="⚠️ Compile disabled on Zero GPU" if IS_ZEROGPU else "",
|
| 2301 |
+
visible=IS_ZEROGPU
|
| 2302 |
+
)
|
| 2303 |
+
use_custom_shapes_checkbox = gr.Checkbox(
|
| 2304 |
+
label="Use Custom Shapes (Advanced)",
|
| 2305 |
+
value=False,
|
| 2306 |
+
info="Override default sequence lengths for text, speaker, and sample"
|
| 2307 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2308 |
|
| 2309 |
+
# Advanced mode controls (hidden by default)
|
| 2310 |
+
with gr.Column(visible=False) as advanced_mode_column:
|
| 2311 |
+
with gr.Row(visible=False) as custom_shapes_row:
|
| 2312 |
+
max_text_byte_length = gr.Textbox(
|
| 2313 |
+
label="Max Text Byte Length (padded)",
|
| 2314 |
+
value="768",
|
| 2315 |
+
info="Maximum text utf-8 byte sequence length (blank -> no padding)",
|
| 2316 |
+
scale=1
|
| 2317 |
+
)
|
| 2318 |
+
max_speaker_latent_length = gr.Textbox(
|
| 2319 |
+
label="Max Speaker Latent Length (padded)",
|
| 2320 |
+
value="2560",
|
| 2321 |
+
info="Maximum (unpatched)speaker latent length (blank -> no padding), default 2560 = ~30s",
|
| 2322 |
+
scale=1
|
| 2323 |
+
)
|
| 2324 |
+
sample_latent_len = gr.Textbox(
|
| 2325 |
+
label="Sample Latent Length",
|
| 2326 |
+
value="640",
|
| 2327 |
+
info="Maximum sample latent length (EXPERIMENTAL!!! ONLY TRAINED WITH 640 BUT SOMEHOW WORKS WITH < 640 TO GENERATE PREFIXES)",
|
| 2328 |
+
scale=1
|
| 2329 |
+
)
|
| 2330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2331 |
|
| 2332 |
+
with gr.Row():
|
| 2333 |
+
# Left column: Core Sampling Parameters
|
| 2334 |
+
with gr.Column(scale=1):
|
| 2335 |
+
with gr.Group():
|
| 2336 |
+
gr.HTML("""
|
| 2337 |
+
<div class="preset-inline">
|
| 2338 |
+
<span class="title">Core Sampling Parameters</span><span class="dim">(</span>
|
| 2339 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="core_default">default</a>
|
| 2340 |
+
<span class="dim">)</span>
|
| 2341 |
+
</div>
|
| 2342 |
+
""")
|
| 2343 |
+
core_preset_default = gr.Button("", elem_id="core_default", elem_classes=["proxy-btn"])
|
| 2344 |
+
num_steps = gr.Number(label="Number of Steps", value=40, info="Number of sampling steps (consider 20 - 80) (capped at 80)", precision=0, minimum=1, step=5, maximum=80)
|
| 2345 |
+
|
| 2346 |
+
cfg_mode = gr.Radio(
|
| 2347 |
+
choices=[
|
| 2348 |
+
"independent",
|
| 2349 |
+
"apg-independent",
|
| 2350 |
+
"alternating",
|
| 2351 |
+
"joint-unconditional"
|
| 2352 |
+
],
|
| 2353 |
+
value="independent",
|
| 2354 |
+
label="CFG Mode",
|
| 2355 |
+
info="Independent (3 NFE), Adaptive Projected Guidance (3 NFE, see https://arxiv.org/abs/2410.02416), Alternating (2 NFE), Joint-Unconditional (2 NFE)"
|
| 2356 |
+
)
|
| 2357 |
+
|
| 2358 |
+
with gr.Group():
|
| 2359 |
+
gr.HTML("""
|
| 2360 |
+
<div class="preset-inline">
|
| 2361 |
+
<span class="title">CFG Guidance</span><span class="dim">(</span>
|
| 2362 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="cfg_default">default</a>
|
| 2363 |
+
<span class="dim">,</span>
|
| 2364 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="cfg_higher">higher speaker</a>
|
| 2365 |
+
<span class="dim">,</span>
|
| 2366 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="cfg_large">large guidances(works with apg)</a>
|
| 2367 |
+
<span class="dim">)</span>
|
| 2368 |
+
</div>
|
| 2369 |
+
""")
|
| 2370 |
+
cfg_preset_default = gr.Button("", elem_id="cfg_default", elem_classes=["proxy-btn"])
|
| 2371 |
+
cfg_preset_higher_speaker = gr.Button("", elem_id="cfg_higher", elem_classes=["proxy-btn"])
|
| 2372 |
+
cfg_preset_large_guidances = gr.Button("", elem_id="cfg_large", elem_classes=["proxy-btn"])
|
| 2373 |
+
with gr.Row():
|
| 2374 |
+
cfg_scale_text = gr.Number(label="Text CFG Scale", value=3.0, info="Guidance strength for text", minimum=0, step=0.5)
|
| 2375 |
+
cfg_scale_speaker = gr.Number(label="Speaker CFG Scale", value=5.0, info="Guidance strength for speaker", minimum=0, step=0.5)
|
| 2376 |
+
|
| 2377 |
+
with gr.Row():
|
| 2378 |
+
cfg_min_t = gr.Number(label="CFG Min t", value=0.5, info="(0-1), CFG applied when t >= val", minimum=0, maximum=1, step=0.05)
|
| 2379 |
+
cfg_max_t = gr.Number(label="CFG Max t", value=1.0, info="(0-1), CFG applied when t <= val", minimum=0, maximum=1, step=0.05)
|
| 2380 |
+
|
| 2381 |
+
# Right column: Speaker KV, Truncation + APG
|
| 2382 |
+
with gr.Column(scale=1):
|
| 2383 |
+
with gr.Group():
|
| 2384 |
+
gr.HTML("""
|
| 2385 |
+
<div class="preset-inline">
|
| 2386 |
+
<span class="title">Speaker KV Attention Scaling</span><span class="dim">(</span>
|
| 2387 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_enable">enable if generation does not match reference</a>
|
| 2388 |
+
<span class="dim">,</span>
|
| 2389 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_off">off</a>
|
| 2390 |
+
<span class="dim">)</span>
|
| 2391 |
+
</div>
|
| 2392 |
+
""")
|
| 2393 |
+
spk_kv_preset_enable = gr.Button("", elem_id="spk_kv_enable", elem_classes=["proxy-btn"])
|
| 2394 |
+
spk_kv_preset_off = gr.Button("", elem_id="spk_kv_off", elem_classes=["proxy-btn"])
|
| 2395 |
+
speaker_k_enable = gr.Checkbox(label="Enable Speaker KV Scaling", value=False, info="Scale speaker attention key-values; useful when the model-generated audio does not at all match the reference audio (i.e. ignores speaker-reference)")
|
| 2396 |
+
|
| 2397 |
+
with gr.Row(visible=False) as speaker_k_row:
|
| 2398 |
+
speaker_k_scale = gr.Number(label="KV Scale", value=1.5, info="Scale factor", minimum=0, step=0.1)
|
| 2399 |
+
speaker_k_min_t = gr.Number(label="KV Min t", value=0.9, info="(0-1), scale applied from steps t=1. to val", minimum=0, maximum=1, step=0.05)
|
| 2400 |
+
speaker_k_max_layers = gr.Number(label="Max Layers", value=24, info="(0-24), scale applied in first N layers", precision=0, minimum=0, maximum=24)
|
| 2401 |
+
|
| 2402 |
+
with gr.Group():
|
| 2403 |
+
gr.HTML("""
|
| 2404 |
+
<div class="preset-inline">
|
| 2405 |
+
<span class="title">Truncation & Temporal Rescaling</span><span class="dim">(</span>
|
| 2406 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="trunc_flat">flat</a>
|
| 2407 |
+
<span class="dim">,</span>
|
| 2408 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="trunc_sharp">sharp</a>
|
| 2409 |
+
<span class="dim">,</span>
|
| 2410 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="trunc_baseline">baseline(sharp)</a>
|
| 2411 |
+
<span class="dim">)</span>
|
| 2412 |
+
</div>
|
| 2413 |
+
""")
|
| 2414 |
+
trunc_preset_flat = gr.Button("", elem_id="trunc_flat", elem_classes=["proxy-btn"])
|
| 2415 |
+
trunc_preset_sharp = gr.Button("", elem_id="trunc_sharp", elem_classes=["proxy-btn"])
|
| 2416 |
+
trunc_preset_baseline = gr.Button("", elem_id="trunc_baseline", elem_classes=["proxy-btn"])
|
| 2417 |
+
with gr.Row():
|
| 2418 |
+
truncation_factor = gr.Number(label="Truncation Factor", value=0.8, info="Multiply initial noise (<1 helps artifacts)", minimum=0, step=0.05)
|
| 2419 |
+
rescale_k = gr.Number(label="Rescale k", value=1.2, info="<1=sharpen, >1=flatten, 1=off", minimum=0, step=0.05)
|
| 2420 |
+
rescale_sigma = gr.Number(label="Rescale σ", value=3.0, info="Sigma parameter", minimum=0, step=0.1)
|
| 2421 |
+
|
| 2422 |
+
with gr.Group(visible=False) as apg_row:
|
| 2423 |
+
gr.HTML("""
|
| 2424 |
+
<div class="preset-inline">
|
| 2425 |
+
<span class="title">APG Parameters</span><span class="dim">(</span>
|
| 2426 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="apg_default">default</a>
|
| 2427 |
+
<span class="dim">,</span>
|
| 2428 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="apg_no_momentum">no momentum</a>
|
| 2429 |
+
<span class="dim">,</span>
|
| 2430 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="apg_norms">norms</a>
|
| 2431 |
+
<span class="dim">,</span>
|
| 2432 |
+
<a href="javascript:void(0)" class="preset-link" data-fire="apg_no_eta">no eta</a>
|
| 2433 |
+
<span class="dim">)</span>
|
| 2434 |
+
</div>
|
| 2435 |
+
""")
|
| 2436 |
+
apg_preset_default = gr.Button("", elem_id="apg_default", elem_classes=["proxy-btn"])
|
| 2437 |
+
apg_preset_no_momentum = gr.Button("", elem_id="apg_no_momentum", elem_classes=["proxy-btn"])
|
| 2438 |
+
apg_preset_norms = gr.Button("", elem_id="apg_norms", elem_classes=["proxy-btn"])
|
| 2439 |
+
apg_preset_no_eta = gr.Button("", elem_id="apg_no_eta", elem_classes=["proxy-btn"])
|
| 2440 |
+
with gr.Row():
|
| 2441 |
+
apg_eta_text = gr.Number(label="APG η (text)", value=0.5, info="Eta for text projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
|
| 2442 |
+
apg_eta_speaker = gr.Number(label="APG η (speaker)", value=0.5, info="Eta for speaker projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
|
| 2443 |
+
|
| 2444 |
+
with gr.Row() as apg_row2:
|
| 2445 |
+
apg_momentum_text = gr.Number(label="APG Momentum (text)", value=-0.25, info="Text momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
|
| 2446 |
+
apg_momentum_speaker = gr.Number(label="APG Momentum (speaker)", value=-0.25, info="Speaker momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
|
| 2447 |
+
with gr.Row():
|
| 2448 |
+
apg_norm_text = gr.Textbox(label="APG Norm (text)", value="", info="Text norm clip (leave blank to disable, can try 7.5, 15.0)")
|
| 2449 |
+
apg_norm_speaker = gr.Textbox(label="APG Norm (speaker)", value="", info="Speaker norm clip (leave blank to disable, can try 7.5, 15.0)")
|
| 2450 |
+
# End of advanced_mode_column
|
| 2451 |
+
|
| 2452 |
+
with gr.Row(equal_height=True):
|
| 2453 |
+
audio_format = gr.Radio(
|
| 2454 |
+
choices=["wav", "mp3"],
|
| 2455 |
+
value="wav",
|
| 2456 |
+
label="Format",
|
| 2457 |
+
scale=1,
|
| 2458 |
+
min_width=90
|
| 2459 |
+
)
|
| 2460 |
+
generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", scale=10)
|
| 2461 |
+
with gr.Column(scale=1):
|
| 2462 |
+
show_original_audio = gr.Checkbox(
|
| 2463 |
+
label="Re-display original audio (full 2-minute cropped mono)",
|
| 2464 |
+
value=False
|
| 2465 |
+
)
|
| 2466 |
+
reconstruct_first_30_seconds = gr.Checkbox(
|
| 2467 |
+
label="Show Autoencoder Reconstruction (only first 30s of reference)",
|
| 2468 |
+
value=False
|
| 2469 |
+
)
|
| 2470 |
+
|
| 2471 |
+
gr.HTML('<hr class="section-separator">')
|
| 2472 |
+
with gr.Accordion("Generated Audio", open=True, visible=True) as generated_section:
|
| 2473 |
+
generation_time_display = gr.Markdown("", visible=False)
|
| 2474 |
+
with gr.Group(elem_classes=["generated-audio-player"]):
|
| 2475 |
+
generated_audio = gr.Audio(label="Generated Audio", visible=True)
|
| 2476 |
+
text_prompt_display = gr.Markdown("", visible=False)
|
| 2477 |
+
|
| 2478 |
+
gr.Markdown("---")
|
| 2479 |
+
reference_audio_header = gr.Markdown("#### Reference Audio", visible=False)
|
| 2480 |
+
|
| 2481 |
+
with gr.Accordion("Original Audio (2 min Cropped Mono)", open=False, visible=False) as original_accordion:
|
| 2482 |
+
original_audio = gr.Audio(label="Original Reference Audio (2 min)", visible=True)
|
| 2483 |
+
|
| 2484 |
+
with gr.Accordion("Autoencoder Reconstruction of First 30s of Reference", open=False, visible=False) as reference_accordion:
|
| 2485 |
+
reference_audio = gr.Audio(label="Decoded Reference Audio (30s)", visible=True)
|
| 2486 |
+
|
| 2487 |
+
# End of Advanced TabItem
|
| 2488 |
+
# End of Tabs
|
| 2489 |
|
| 2490 |
# Event handlers
|
| 2491 |
+
|
| 2492 |
+
# Simple View - Generate button handler
|
| 2493 |
+
simple_generate_btn.click(
|
| 2494 |
+
generate_audio_simple,
|
| 2495 |
+
inputs=[
|
| 2496 |
+
simple_text_prompt,
|
| 2497 |
+
simple_audio_input,
|
| 2498 |
+
simple_preset,
|
| 2499 |
+
simple_rng_seed,
|
| 2500 |
+
simple_num_steps,
|
| 2501 |
+
simple_speaker_kv_enable,
|
| 2502 |
+
simple_speaker_kv_scale,
|
| 2503 |
+
session_id_state,
|
| 2504 |
+
],
|
| 2505 |
+
outputs=[simple_generated_audio, simple_time_display]
|
| 2506 |
+
)
|
| 2507 |
+
|
| 2508 |
+
# Simple View - Speaker KV checkbox toggle
|
| 2509 |
+
simple_speaker_kv_enable.change(
|
| 2510 |
+
lambda enabled: gr.update(visible=enabled),
|
| 2511 |
+
inputs=[simple_speaker_kv_enable],
|
| 2512 |
+
outputs=[simple_speaker_kv_scale]
|
| 2513 |
+
)
|
| 2514 |
+
|
| 2515 |
+
# Simple View - Preset dropdown handler
|
| 2516 |
+
def apply_simple_preset(preset_name):
    """Translate a sampler preset name into updates for the Simple View controls.

    Returns a list of three Gradio updates, in the order the handler is
    wired: step count value, speaker-KV checkbox value, and speaker-KV
    scale visibility. When no preset is selected, or the name is not found
    among the loaded sampler presets, all three updates are no-ops.
    """
    # Nothing selected: leave every control untouched (presets are not loaded).
    if not preset_name:
        return [gr.update()] * 3

    available = load_sampler_presets()
    if preset_name not in available:
        # Unknown preset name: leave every control untouched.
        return [gr.update()] * 3

    chosen = available[preset_name]
    step_count = int(chosen.get("num_steps", 40))
    kv_enabled = chosen.get("speaker_k_enable", False)

    # The scale control is only shown while speaker KV is enabled.
    return [
        gr.update(value=step_count),
        gr.update(value=kv_enabled),
        gr.update(visible=kv_enabled),
    ]
|
| 2530 |
+
|
| 2531 |
+
simple_preset.change(
|
| 2532 |
+
apply_simple_preset,
|
| 2533 |
+
inputs=[simple_preset],
|
| 2534 |
+
outputs=[simple_num_steps, simple_speaker_kv_enable, simple_speaker_kv_scale]
|
| 2535 |
+
)
|
| 2536 |
+
|
| 2537 |
+
# Simple View - Audio preset dropdown handler
|
| 2538 |
+
def select_simple_audio_preset(preset_name):
    """Resolve the audio preset dropdown choice into an update for the audio input.

    The placeholder entry (or an empty selection) clears the audio input.
    Otherwise the name is looked up inside AUDIO_PROMPT_FOLDER and, when the
    file exists, its path is loaded into the audio input. In every other
    case the audio input is left unchanged.
    """
    placeholder = "(upload your own or select from dropdown)"
    if not preset_name or preset_name == placeholder:
        # Clear the audio input
        return gr.update(value=None)

    # No preset folder configured: nothing to load.
    if AUDIO_PROMPT_FOLDER is None:
        return gr.update()

    candidate = AUDIO_PROMPT_FOLDER / preset_name
    return gr.update(value=str(candidate)) if candidate.exists() else gr.update()
|
| 2546 |
+
|
| 2547 |
+
simple_audio_preset.change(
|
| 2548 |
+
select_simple_audio_preset,
|
| 2549 |
+
inputs=[simple_audio_preset],
|
| 2550 |
+
outputs=[simple_audio_input]
|
| 2551 |
+
)
|
| 2552 |
+
|
| 2553 |
+
# Simple View - Text preset table selection handler
|
| 2554 |
+
def select_simple_text_preset(evt: gr.SelectData):
    """Copy the text of the clicked preset row into the Simple View prompt box.

    Uses the first element of the selection index as the row number and
    returns an update carrying that row's element at position 2. A row
    number at or beyond the number of loaded presets yields a no-op update.
    """
    rows = load_text_presets()
    row_idx = evt.index[0]

    # Selection falls outside the loaded presets: change nothing.
    if row_idx >= len(rows):
        return gr.update()

    return gr.update(value=rows[row_idx][2])
|
| 2559 |
+
|
| 2560 |
+
simple_text_presets_table.select(
|
| 2561 |
+
select_simple_text_preset,
|
| 2562 |
+
outputs=[simple_text_prompt]
|
| 2563 |
+
)
|
| 2564 |
+
|
| 2565 |
+
# Simple View - Reset audio preset dropdown when audio is cleared
|
| 2566 |
+
simple_audio_input.clear(
|
| 2567 |
+
lambda: gr.update(value="(upload your own or select from dropdown)"),
|
| 2568 |
+
outputs=[simple_audio_preset]
|
| 2569 |
+
)
|
| 2570 |
+
|
| 2571 |
+
# Advanced View Event handlers
|
| 2572 |
# Custom Audio Panel - handle audio change to update speaker_audio_path_state
|
| 2573 |
custom_audio_input.change(
|
| 2574 |
lambda audio: gr.update(value=audio if audio else ""),
|