jordand committed on
Commit
e099082
·
verified ·
1 Parent(s): b3f53dd

add simple version

Browse files
Files changed (1) hide show
  1. app.py +834 -480
app.py CHANGED
@@ -538,6 +538,154 @@ def generate_audio(
538
  )
539
 
540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  # UI Helper Functions
542
 
543
  def load_speaker_metadata(speaker_id):
@@ -1709,512 +1857,718 @@ def init_and_compile():
1709
  # On Zero GPU, don't try to compile
1710
  return session_id, gr.update(), gr.update()
1711
 
1712
- with gr.Blocks(title="Echo-TTS", css=LINK_CSS, js=JS_CODE) as demo:
1713
- gr.Markdown("# Echo-TTS")
1714
- gr.Markdown("*Jordan Darefsky, 2025. See technical details [here](https://jordandarefsky.com/blog/2025/echo/)*")
1715
-
1716
- # License notice for Fish Speech autoencoder
1717
- gr.Markdown("**License Notice:** All audio outputs are subject to non-commercial use [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).")
1718
-
1719
- # Silentcipher watermarking notice
1720
- if USE_SILENTCIPHER:
1721
- gr.Markdown(f"*Audio output is watermarked with [silentcipher](https://github.com/sony/silentcipher) using message `{SILENTCIPHER_MESSAGE}`*")
1722
-
1723
- # Instructions for Simple Mode
1724
- with gr.Accordion("📖 Quick Start Instructions", open=True):
1725
- gr.Markdown("""
1726
- ### Simple Mode (Recommended for Beginners)
1727
-
1728
- 1. **Pick or upload a voice** - Choose from the voicebank or upload your own audio (up to 2 minutes)
1729
- 2. **Choose a text prompt preset or enter your own prompt** - What you want the voice to say (the presets are a good guide for format/style)
1730
- 3. **Select a Sampling preset (optional) ** - The default preset "Independent (High Speaker CFG)" is usually good to start
1731
- 4. **Click Generate Audio** - Wait for the model to generate your audio
1732
-
1733
- <div class="tip-box">
1734
-
1735
- 💡 **Tip:** If the generated voice doesn't match the reference speaker at all, enable "Speaker KV Attention Scaling" and click Generate Audio again.
1736
-
1737
- </div>
1738
-
1739
- ### Advanced Mode
1740
-
1741
- Switch to Advanced mode for full control over all generation parameters including CFG scales, sampling steps, truncation, and more.
1742
-
1743
- ### Other tips
1744
-
1745
- High CFG settings are recommended but may lead to oversaturation; APG might help with this. Flat settings tend to reduce "impulse" artifacts but might result in worse (blunted/compressed/artifact-y) laughter, breathing, etc. generation.
1746
 
1747
- Echo will try to fit the entire text-prompt into (<=) 30 seconds of audio. If your prompt is very long, the generated speech may be too quick (this is not an issue for shorter text-prompts). For disfluent, single-speaker speech, we recommend trying the reference text beginning with "[S1] ... explore how we can design" as a starting point.
1748
- """)
 
1749
 
1750
- # Session state for per-user file management
1751
  session_id_state = gr.State(None)
1752
 
1753
- # Hidden state variables to store paths and selection
1754
- selected_speaker_state = gr.Textbox(visible=False, value="")
1755
- speaker_st_path_state = gr.Textbox(visible=False, value="")
1756
- speaker_audio_path_state = gr.Textbox(visible=False, value="")
1757
-
1758
- gr.Markdown("# Voice Selection")
1759
-
1760
- # Dataset selector
1761
- dataset_selector = gr.Radio(
1762
- choices=["Custom Audio Panel", "EARS", "VCTK", "Expresso", "HF-Custom"],
1763
- value="Custom Audio Panel",
1764
- label="Select Dataset",
1765
- info="Choose which voicebank to use"
1766
- )
1767
-
1768
- dataset_license_info = gr.Markdown(
1769
- "",
1770
- visible=False
1771
- )
1772
-
1773
- # Custom Audio Panel UI (visible by default, takes full width)
1774
- with gr.Row(visible=True) as custom_audio_row:
1775
- # Optional: Audio prompt library table (only shown if AUDIO_PROMPT_FOLDER is configured)
1776
- if AUDIO_PROMPT_FOLDER is not None and AUDIO_PROMPT_FOLDER.exists():
1777
- with gr.Column(scale=1, min_width=200):
1778
- gr.Markdown("#### Audio Library (favorite examples from voicebank datasets)")
1779
- audio_prompt_table = gr.Dataframe(
1780
- value=get_audio_prompt_files(),
1781
- headers=["Filename"],
1782
- datatype=["str"],
1783
- row_count=(10, "dynamic"),
1784
- col_count=(1, "fixed"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1785
  interactive=False,
1786
- label="Click to select (or upload your own audio file directly on the right)"
 
1787
  )
 
 
 
1788
 
1789
- with gr.Column(scale=2):
1790
- custom_audio_input = gr.Audio(
1791
- sources=["upload", "microphone"],
1792
- type="filepath",
1793
- label="Speaker Reference Audio (only first two minutes will be used; leave empty for zero speaker conditioning)",
1794
- max_length=600 # Maximum duration in seconds (10 minutes)
1795
- )
1796
-
1797
- with gr.Row(visible=False) as voicebank_row:
1798
- # Voice selection UI for all voicebank datasets
1799
-
1800
- # EARS UI (visible by default when voicebank_row is shown)
1801
- with gr.Column(scale=2, visible=True) as ears_column:
1802
- gr.Markdown("### 1. Speakers (EARS)")
1803
- selected_speaker_display = gr.Textbox(
1804
- value="",
1805
- label="",
1806
- show_label=False,
1807
- interactive=False,
1808
- visible=False,
1809
- lines=2,
1810
- max_lines=2
1811
- )
1812
- speaker_search = gr.Textbox(
1813
- placeholder="Search speakers (by ID, gender, age, ethnicity, language)...",
1814
- label="",
1815
- show_label=False,
1816
- container=False
1817
- )
1818
- speakers_table = gr.Dataframe(
1819
- value=get_speakers_table(),
1820
- headers=["ID", "G", "Age", "Ethnicity", "Native Lang"],
1821
- datatype=["str", "str", "str", "str", "str"],
1822
- row_count=(8, "dynamic"),
1823
- col_count=(5, "fixed"),
1824
- interactive=False,
1825
- label="Click any cell to select",
1826
- column_widths=["10%", "8%", "15%", "30%", "37%"]
1827
- )
1828
-
1829
- # VCTK UI (hidden by default)
1830
- with gr.Column(scale=2, visible=False) as vctk_column:
1831
- gr.Markdown("### 1. Speakers (VCTK)")
1832
- vctk_speaker_display = gr.Textbox(
1833
- value="",
1834
- label="",
1835
- show_label=False,
1836
- interactive=False,
1837
- visible=False,
1838
- lines=2,
1839
- max_lines=2
1840
- )
1841
- vctk_speaker_search = gr.Textbox(
1842
- placeholder="Search speakers (by ID, gender, age, details)...",
1843
- label="",
1844
- show_label=False,
1845
- container=False
1846
- )
1847
- vctk_speakers_table = gr.Dataframe(
1848
- value=get_vctk_speakers_table(),
1849
- headers=["ID", "G", "Age", "Details", "Length"],
1850
- datatype=["str", "str", "str", "str", "str"],
1851
- row_count=(8, "dynamic"),
1852
- col_count=(5, "fixed"),
1853
- interactive=False,
1854
- label="Click any cell to select",
1855
- column_widths=["10%", "8%", "12%", "50%", "20%"]
1856
- )
1857
-
1858
- # Expresso UI (hidden by default)
1859
- with gr.Column(scale=2, visible=False) as expresso_column:
1860
- gr.Markdown("### 1. Voices (Expresso)")
1861
- expresso_speaker_display = gr.Textbox(
1862
- value="",
1863
- label="",
1864
- show_label=False,
1865
- interactive=False,
1866
- visible=False,
1867
- lines=2,
1868
- max_lines=2
1869
- )
1870
- expresso_speaker_search = gr.Textbox(
1871
- placeholder="Search voices (by ID, type, speakers, style)...",
1872
- label="",
1873
- show_label=False,
1874
- container=False
1875
- )
1876
- expresso_speakers_table = gr.Dataframe(
1877
- value=get_expresso_speakers_table(),
1878
- headers=["ID", "Type", "Speakers", "Style", "Length"],
1879
- datatype=["str", "str", "str", "str", "str"],
1880
- row_count=(8, "dynamic"),
1881
- col_count=(5, "fixed"),
1882
- interactive=False,
1883
- label="Click any cell to select",
1884
- column_widths=["35%", "15%", "15%", "15%", "20%"]
1885
- )
1886
-
1887
- # HF-Custom UI (hidden by default)
1888
- with gr.Column(scale=2, visible=False) as hf_custom_column:
1889
- gr.Markdown("### 1. Voices (HF-Custom)")
1890
- hf_custom_speaker_display = gr.Textbox(
1891
- value="",
1892
- label="",
1893
- show_label=False,
1894
- interactive=False,
1895
- visible=False,
1896
- lines=2,
1897
- max_lines=2
1898
- )
1899
- hf_custom_speaker_search = gr.Textbox(
1900
- placeholder="Search voices (by name, dataset, description)...",
1901
- label="",
1902
- show_label=False,
1903
- container=False
1904
- )
1905
- hf_custom_speakers_table = gr.Dataframe(
1906
- value=get_hf_custom_speakers_table(),
1907
- headers=["Name", "Dataset", "Description", "Length"],
1908
- datatype=["str", "str", "str", "str"],
1909
- row_count=(8, "dynamic"),
1910
- col_count=(4, "fixed"),
1911
- interactive=False,
1912
- label="Click any cell to select",
1913
- column_widths=["15%", "15%", "50%", "20%"]
1914
- )
1915
-
1916
- with gr.Column(scale=1, visible=True) as voice_type_column:
1917
- gr.Markdown("### 2. Voice Type")
1918
- selected_voice_display = gr.Textbox(
1919
- value="",
1920
- label="",
1921
- show_label=False,
1922
- interactive=False,
1923
- visible=False,
1924
- lines=2,
1925
- max_lines=2
1926
- )
1927
- freeform_table = gr.Dataframe(
1928
- value=[],
1929
- headers=["Type", "Length"],
1930
- datatype=["str", "str"],
1931
- row_count=(1, "fixed"),
1932
- col_count=(2, "fixed"),
1933
- interactive=False,
1934
- label="Freeform voice",
1935
- visible=True,
1936
- column_widths=["60%", "40%"]
1937
- )
1938
- gr.Markdown("**Emotions:**")
1939
- emotions_table = gr.Dataframe(
1940
- value=[],
1941
- headers=["Emotion", "Length"],
1942
- datatype=["str", "str"],
1943
- row_count=(8, "dynamic"),
1944
- col_count=(2, "fixed"),
1945
- interactive=False,
1946
- visible=True,
1947
- column_widths=["60%", "40%"]
1948
- )
1949
-
1950
- with gr.Column(scale=1):
1951
- gr.Markdown("### 3. Audio Preview")
1952
- audio_preview = gr.Audio(label="Voice Sample", type="filepath", interactive=False)
1953
-
1954
- gr.HTML('<hr class="section-separator">')
1955
- gr.Markdown("# Text Prompt")
1956
- with gr.Accordion("Text Presets", open=True):
1957
- text_presets_table = gr.Dataframe(
1958
- value=load_text_presets(),
1959
- headers=["Category", "Words", "Preset Text"],
1960
- datatype=["str", "str", "str"],
1961
- row_count=(3, "dynamic"),
1962
- col_count=(3, "fixed"),
1963
- interactive=False,
1964
- column_widths=["12%", "6%", "82%"]
1965
- )
1966
- text_prompt = gr.Textbox(
1967
- label="Text Prompt",
1968
- placeholder="[S1] Enter your text prompt here...",
1969
- lines=4
1970
- )
1971
-
1972
- gr.HTML('<hr class="section-separator">')
1973
- gr.Markdown("# Generation")
1974
-
1975
- # Mode selector: Simple or Advanced (outside the accordion, centered and prominent)
1976
- with gr.Row():
1977
- with gr.Column(scale=1):
1978
- pass # Empty column for spacing
1979
- with gr.Column(scale=2):
1980
- mode_selector = gr.Radio(
1981
- choices=["Simple Mode", "Advanced Mode"],
1982
- value="Simple Mode",
1983
- label="",
1984
- info=None,
1985
- elem_id="component-mode-selector"
1986
- )
1987
- with gr.Column(scale=1):
1988
- pass # Empty column for spacing
1989
-
1990
- with gr.Accordion("⚙️ Generation Parameters", open=True):
1991
-
1992
- with gr.Row():
1993
- presets = load_sampler_presets()
1994
- preset_keys = list(presets.keys())
1995
- first_preset = preset_keys[0] if preset_keys else "Custom"
1996
 
1997
- preset_dropdown = gr.Dropdown(
1998
- choices=["Custom"] + preset_keys,
1999
- value=first_preset, # Default to first preset instead of Custom
2000
- label="Sampler Preset",
2001
- info="Load preset configurations",
2002
- scale=2
2003
  )
2004
 
2005
- rng_seed = gr.Number(
2006
- label="RNG Seed",
2007
- value=0,
2008
- info="Random seed for starting noise",
2009
- precision=0,
2010
- scale=1
2011
  )
2012
 
2013
- # Simple mode: Speaker KV checkbox on same row (visible by default)
2014
- with gr.Column(scale=1, visible=True) as simple_mode_row:
2015
- speaker_kv_simple_checkbox = gr.Checkbox(
2016
- label="\"Force Speaker\" (Enable Speaker KV Attention Scaling)",
2017
- value=False,
2018
- info="Enable if generation does not match reference voice (otherwise leave off)"
2019
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2020
 
2021
- # Advanced mode: Compile and custom shapes checkboxes (hidden by default)
2022
- with gr.Column(scale=1, visible=False) as advanced_mode_compile_column:
2023
- compile_checkbox = gr.Checkbox(
2024
- label="Compile Model",
2025
- value=True, # Default to True in simple mode
2026
- interactive=not IS_ZEROGPU,
2027
- info="Compile disabled on Zero GPU" if IS_ZEROGPU else "~20-30% faster after initial compilation"
2028
- )
2029
- compile_status = gr.Markdown(
2030
- value="⚠️ Compile disabled on Zero GPU" if IS_ZEROGPU else "",
2031
- visible=IS_ZEROGPU
2032
- )
2033
- use_custom_shapes_checkbox = gr.Checkbox(
2034
- label="Use Custom Shapes (Advanced)",
2035
- value=False,
2036
- info="Override default sequence lengths for text, speaker, and sample"
2037
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2038
 
2039
- # Advanced mode controls (hidden by default)
2040
- with gr.Column(visible=False) as advanced_mode_column:
2041
- with gr.Row(visible=False) as custom_shapes_row:
2042
- max_text_byte_length = gr.Textbox(
2043
- label="Max Text Byte Length (padded)",
2044
- value="768",
2045
- info="Maximum text utf-8 byte sequence length (blank -> no padding)",
2046
- scale=1
2047
- )
2048
- max_speaker_latent_length = gr.Textbox(
2049
- label="Max Speaker Latent Length (padded)",
2050
- value="2560",
2051
- info="Maximum (unpatched)speaker latent length (blank -> no padding), default 2560 = ~30s",
2052
- scale=1
2053
- )
2054
- sample_latent_len = gr.Textbox(
2055
- label="Sample Latent Length",
2056
- value="640",
2057
- info="Maximum sample latent length (EXPERIMENTAL!!! ONLY TRAINED WITH 640 BUT SOMEHOW WORKS WITH < 640 TO GENERATE PREFIXES)",
2058
- scale=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2059
  )
 
 
 
 
 
2060
 
 
 
2061
 
 
2062
  with gr.Row():
2063
- # Left column: Core Sampling Parameters
2064
  with gr.Column(scale=1):
2065
- with gr.Group():
2066
- gr.HTML("""
2067
- <div class="preset-inline">
2068
- <span class="title">Core Sampling Parameters</span><span class="dim">(</span>
2069
- <a href="javascript:void(0)" class="preset-link" data-fire="core_default">default</a>
2070
- <span class="dim">)</span>
2071
- </div>
2072
- """)
2073
- core_preset_default = gr.Button("", elem_id="core_default", elem_classes=["proxy-btn"])
2074
- num_steps = gr.Number(label="Number of Steps", value=40, info="Number of sampling steps (consider 20 - 80) (capped at 80)", precision=0, minimum=1, step=5, maximum=80)
2075
-
2076
- cfg_mode = gr.Radio(
2077
- choices=[
2078
- "independent",
2079
- "apg-independent",
2080
- "alternating",
2081
- "joint-unconditional"
2082
- ],
2083
- value="independent",
2084
- label="CFG Mode",
2085
- info="Independent (3 NFE), Adaptive Projected Guidance (3 NFE, see https://arxiv.org/abs/2410.02416), Alternating (2 NFE), Joint-Unconditional (2 NFE)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2086
  )
2087
 
2088
- with gr.Group():
2089
- gr.HTML("""
2090
- <div class="preset-inline">
2091
- <span class="title">CFG Guidance</span><span class="dim">(</span>
2092
- <a href="javascript:void(0)" class="preset-link" data-fire="cfg_default">default</a>
2093
- <span class="dim">,</span>
2094
- <a href="javascript:void(0)" class="preset-link" data-fire="cfg_higher">higher speaker</a>
2095
- <span class="dim">,</span>
2096
- <a href="javascript:void(0)" class="preset-link" data-fire="cfg_large">large guidances(works with apg)</a>
2097
- <span class="dim">)</span>
2098
- </div>
2099
- """)
2100
- cfg_preset_default = gr.Button("", elem_id="cfg_default", elem_classes=["proxy-btn"])
2101
- cfg_preset_higher_speaker = gr.Button("", elem_id="cfg_higher", elem_classes=["proxy-btn"])
2102
- cfg_preset_large_guidances = gr.Button("", elem_id="cfg_large", elem_classes=["proxy-btn"])
2103
- with gr.Row():
2104
- cfg_scale_text = gr.Number(label="Text CFG Scale", value=3.0, info="Guidance strength for text", minimum=0, step=0.5)
2105
- cfg_scale_speaker = gr.Number(label="Speaker CFG Scale", value=5.0, info="Guidance strength for speaker", minimum=0, step=0.5)
2106
-
2107
- with gr.Row():
2108
- cfg_min_t = gr.Number(label="CFG Min t", value=0.5, info="(0-1), CFG applied when t >= val", minimum=0, maximum=1, step=0.05)
2109
- cfg_max_t = gr.Number(label="CFG Max t", value=1.0, info="(0-1), CFG applied when t <= val", minimum=0, maximum=1, step=0.05)
2110
 
2111
- # Right column: Speaker KV, Truncation + APG
2112
- with gr.Column(scale=1):
2113
- with gr.Group():
2114
- gr.HTML("""
2115
- <div class="preset-inline">
2116
- <span class="title">Speaker KV Attention Scaling</span><span class="dim">(</span>
2117
- <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_enable">enable if generation does not match reference</a>
2118
- <span class="dim">,</span>
2119
- <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_off">off</a>
2120
- <span class="dim">)</span>
2121
- </div>
2122
- """)
2123
- spk_kv_preset_enable = gr.Button("", elem_id="spk_kv_enable", elem_classes=["proxy-btn"])
2124
- spk_kv_preset_off = gr.Button("", elem_id="spk_kv_off", elem_classes=["proxy-btn"])
2125
- speaker_k_enable = gr.Checkbox(label="Enable Speaker KV Scaling", value=False, info="Scale speaker attention key-values; useful when the model-generated audio does not at all match the reference audio (i.e. ignores speaker-reference)")
2126
-
2127
- with gr.Row(visible=False) as speaker_k_row:
2128
- speaker_k_scale = gr.Number(label="KV Scale", value=1.5, info="Scale factor", minimum=0, step=0.1)
2129
- speaker_k_min_t = gr.Number(label="KV Min t", value=0.9, info="(0-1), scale applied from steps t=1. to val", minimum=0, maximum=1, step=0.05)
2130
- speaker_k_max_layers = gr.Number(label="Max Layers", value=24, info="(0-24), scale applied in first N layers", precision=0, minimum=0, maximum=24)
 
2131
 
2132
- with gr.Group():
2133
- gr.HTML("""
2134
- <div class="preset-inline">
2135
- <span class="title">Truncation &amp; Temporal Rescaling</span><span class="dim">(</span>
2136
- <a href="javascript:void(0)" class="preset-link" data-fire="trunc_flat">flat</a>
2137
- <span class="dim">,</span>
2138
- <a href="javascript:void(0)" class="preset-link" data-fire="trunc_sharp">sharp</a>
2139
- <span class="dim">,</span>
2140
- <a href="javascript:void(0)" class="preset-link" data-fire="trunc_baseline">baseline(sharp)</a>
2141
- <span class="dim">)</span>
2142
- </div>
2143
- """)
2144
- trunc_preset_flat = gr.Button("", elem_id="trunc_flat", elem_classes=["proxy-btn"])
2145
- trunc_preset_sharp = gr.Button("", elem_id="trunc_sharp", elem_classes=["proxy-btn"])
2146
- trunc_preset_baseline = gr.Button("", elem_id="trunc_baseline", elem_classes=["proxy-btn"])
2147
- with gr.Row():
2148
- truncation_factor = gr.Number(label="Truncation Factor", value=0.8, info="Multiply initial noise (<1 helps artifacts)", minimum=0, step=0.05)
2149
- rescale_k = gr.Number(label="Rescale k", value=1.2, info="<1=sharpen, >1=flatten, 1=off", minimum=0, step=0.05)
2150
- rescale_sigma = gr.Number(label="Rescale σ", value=3.0, info="Sigma parameter", minimum=0, step=0.1)
2151
 
2152
- with gr.Group(visible=False) as apg_row:
2153
- gr.HTML("""
2154
- <div class="preset-inline">
2155
- <span class="title">APG Parameters</span><span class="dim">(</span>
2156
- <a href="javascript:void(0)" class="preset-link" data-fire="apg_default">default</a>
2157
- <span class="dim">,</span>
2158
- <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_momentum">no momentum</a>
2159
- <span class="dim">,</span>
2160
- <a href="javascript:void(0)" class="preset-link" data-fire="apg_norms">norms</a>
2161
- <span class="dim">,</span>
2162
- <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_eta">no eta</a>
2163
- <span class="dim">)</span>
2164
- </div>
2165
- """)
2166
- apg_preset_default = gr.Button("", elem_id="apg_default", elem_classes=["proxy-btn"])
2167
- apg_preset_no_momentum = gr.Button("", elem_id="apg_no_momentum", elem_classes=["proxy-btn"])
2168
- apg_preset_norms = gr.Button("", elem_id="apg_norms", elem_classes=["proxy-btn"])
2169
- apg_preset_no_eta = gr.Button("", elem_id="apg_no_eta", elem_classes=["proxy-btn"])
2170
- with gr.Row():
2171
- apg_eta_text = gr.Number(label="APG η (text)", value=0.5, info="Eta for text projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
2172
- apg_eta_speaker = gr.Number(label="APG η (speaker)", value=0.5, info="Eta for speaker projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
2173
-
2174
- with gr.Row() as apg_row2:
2175
- apg_momentum_text = gr.Number(label="APG Momentum (text)", value=-0.25, info="Text momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
2176
- apg_momentum_speaker = gr.Number(label="APG Momentum (speaker)", value=-0.25, info="Speaker momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
2177
- with gr.Row():
2178
- apg_norm_text = gr.Textbox(label="APG Norm (text)", value="", info="Text norm clip (leave blank to disable, can try 7.5, 15.0)")
2179
- apg_norm_speaker = gr.Textbox(label="APG Norm (speaker)", value="", info="Speaker norm clip (leave blank to disable, can try 7.5, 15.0)")
2180
- # End of advanced_mode_column
2181
-
2182
- with gr.Row(equal_height=True):
2183
- audio_format = gr.Radio(
2184
- choices=["wav", "mp3"],
2185
- value="wav",
2186
- label="Format",
2187
- scale=1,
2188
- min_width=90
2189
- )
2190
- generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", scale=10)
2191
- with gr.Column(scale=1):
2192
- show_original_audio = gr.Checkbox(
2193
- label="Re-display original audio (full 2-minute cropped mono)",
2194
- value=False
2195
- )
2196
- reconstruct_first_30_seconds = gr.Checkbox(
2197
- label="Show Autoencoder Reconstruction (only first 30s of reference)",
2198
- value=False
2199
- )
2200
-
2201
- gr.HTML('<hr class="section-separator">')
2202
- with gr.Accordion("Generated Audio", open=True, visible=True) as generated_section:
2203
- generation_time_display = gr.Markdown("", visible=False)
2204
- with gr.Group(elem_classes=["generated-audio-player"]):
2205
- generated_audio = gr.Audio(label="Generated Audio", visible=True)
2206
- text_prompt_display = gr.Markdown("", visible=False)
2207
-
2208
- gr.Markdown("---")
2209
- reference_audio_header = gr.Markdown("#### Reference Audio", visible=False)
2210
-
2211
- with gr.Accordion("Original Audio (2 min Cropped Mono)", open=False, visible=False) as original_accordion:
2212
- original_audio = gr.Audio(label="Original Reference Audio (2 min)", visible=True)
2213
-
2214
- with gr.Accordion("Autoencoder Reconstruction of First 30s of Reference", open=False, visible=False) as reference_accordion:
2215
- reference_audio = gr.Audio(label="Decoded Reference Audio (30s)", visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2216
 
2217
  # Event handlers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2218
  # Custom Audio Panel - handle audio change to update speaker_audio_path_state
2219
  custom_audio_input.change(
2220
  lambda audio: gr.update(value=audio if audio else ""),
 
538
  )
539
 
540
 
541
@spaces.GPU
def generate_audio_simple(
    text_prompt: str,
    speaker_audio_path: str,
    preset_name: str,
    rng_seed: int,
    num_steps: int,
    speaker_kv_enable: bool,
    speaker_kv_scale: float,
    session_id: str,
) -> Tuple[Any, Any]:
    """Simplified audio generation with preset-based parameters for the Simple View.

    Sampler settings (CFG scales, CFG time window, truncation, rescaling) are
    read from the sampler preset named ``preset_name``; any key missing from the
    preset, or not convertible to ``float``, falls back to a built-in default.
    Guidance mode is always ``GuidanceMode.INDEPENDENT`` and APG is disabled.

    Args:
        text_prompt: Text forwarded unchanged to ``sample_pipeline``.
        speaker_audio_path: Path of the reference audio. ``None`` or an empty
            string means no speaker audio is loaded and ``speaker_audio=None``
            is passed to the pipeline.
        preset_name: Key into ``load_sampler_presets()``. An unknown name
            yields an empty preset, i.e. all defaults.
        rng_seed: Seed forwarded to the pipeline; ``None`` is treated as 0.
        num_steps: Number of sampling steps, clamped to the range 1..80.
            ``None`` (e.g. a cleared number field) falls back to 40.
        speaker_kv_enable: When true, speaker KV scaling is turned on with
            ``min_t=0.9`` and ``max_layers=24``.
        speaker_kv_scale: KV scale factor used when scaling is enabled; a
            falsy value (``None`` or 0) falls back to 1.5.
        session_id: Identifier used for temp-file cleanup and for building the
            output file stem.

    Returns:
        A pair of ``gr.update`` objects: the first carries the path of the
        saved WAV file, the second a "generated in N s" status string. Both
        are marked visible.
    """
    default_num_steps = 40  # same value the Simple View "Steps" field starts with

    # Load models on first use (required for Zero GPU)
    load_models()

    # Use compiled model if available, otherwise uncompiled.
    # These module-level names are only read here, so no `global` is needed.
    active_model = model_compiled if model_compiled is not None else model

    # Cleanup old temp files
    cleanup_temp_audio(TEMP_AUDIO_DIR, session_id)

    # Normalise "no speaker" (None or empty string) to None
    if not speaker_audio_path:
        speaker_audio_path = None

    start_time = time.time()

    # Load preset values
    presets = load_sampler_presets()
    preset = presets.get(preset_name, {})

    def to_float(val, default):
        """Convert a preset value (possibly a string) to float, else ``default``."""
        try:
            return float(val) if val is not None else default
        except (ValueError, TypeError):
            return default

    # A cleared number field arrives as None; guard it the same way rng_seed
    # is guarded instead of letting int(None) raise.
    if num_steps is None:
        num_steps_int = default_num_steps
    else:
        num_steps_int = min(max(int(num_steps), 1), 80)
    rng_seed_int = int(rng_seed) if rng_seed is not None else 0

    # Apply preset values (or use defaults)
    cfg_scale_text_val = to_float(preset.get("cfg_scale_text"), 3.0)
    cfg_scale_speaker_val = to_float(preset.get("cfg_scale_speaker"), 8.0)
    cfg_min_t_val = to_float(preset.get("cfg_min_t"), 0.5)
    cfg_max_t_val = to_float(preset.get("cfg_max_t"), 1.0)
    truncation_factor_val = to_float(preset.get("truncation_factor"), 1.0)
    rescale_k_raw = to_float(preset.get("rescale_k"), 1.0)
    rescale_k_val = rescale_k_raw if rescale_k_raw != 1.0 else None  # 1.0 means off
    rescale_sigma_val = to_float(preset.get("rescale_sigma"), 3.0)
    guidance_mode = GuidanceMode.INDEPENDENT  # Simple view always uses independent

    # Speaker KV parameters (user override takes precedence)
    if speaker_kv_enable:
        speaker_k_scale_val = float(speaker_kv_scale) if speaker_kv_scale else 1.5
        speaker_k_min_t_val = 0.9
        speaker_k_max_layers_val = 24
    else:
        speaker_k_scale_val = None
        speaker_k_min_t_val = None
        speaker_k_max_layers_val = None

    # Default shapes
    pad_to_max_text_seq_len = 768
    pad_to_max_speaker_latent_len = 2560
    sample_latent_len_val = 640

    # Create sample function with parameters
    sample_fn = partial(
        sample_euler_cfg_any,
        num_steps=num_steps_int,
        guidance_mode=guidance_mode,
        cfg_scale_text=cfg_scale_text_val,
        cfg_scale_speaker=cfg_scale_speaker_val,
        cfg_min_t=cfg_min_t_val,
        cfg_max_t=cfg_max_t_val,
        truncation_factor=truncation_factor_val,
        rescale_k=rescale_k_val,
        rescale_sigma=rescale_sigma_val,
        speaker_k_scale=speaker_k_scale_val,
        speaker_k_min_t=speaker_k_min_t_val,
        speaker_k_max_layers=speaker_k_max_layers_val,
        apg_eta_text=None,
        apg_eta_speaker=None,
        apg_momentum_text=None,
        apg_momentum_speaker=None,
        apg_norm_text=None,
        apg_norm_speaker=None,
        block_size=sample_latent_len_val,
    )

    # Load speaker audio if provided
    if speaker_audio_path is not None:
        speaker_audio = load_audio(speaker_audio_path).cuda()
    else:
        speaker_audio = None

    # Generate audio
    audio_out = sample_pipeline(
        model=active_model,
        fish_ae=fish_ae,
        pca_state=pca_state,
        sample_fn=sample_fn,
        text_prompt=text_prompt,
        speaker_audio=speaker_audio,
        rng_seed=rng_seed_int,
        pad_to_max_text_seq_len=pad_to_max_text_seq_len,
        pad_to_max_speaker_latent_len=pad_to_max_speaker_latent_len,
    )

    # Apply silentcipher watermarking if enabled
    audio_to_save = audio_out[0].cpu()
    if USE_SILENTCIPHER and silentcipher_model is not None:
        # Best-effort: a watermarking failure must not lose the generated
        # audio, so the error is reported and the clean audio is saved.
        try:
            audio_numpy = audio_to_save.squeeze(0).numpy()
            encoded_audio, sdr = silentcipher_model.encode_wav(
                audio_numpy,
                44100,
                SILENTCIPHER_MESSAGE,
                message_sdr=SILENTCIPHER_SDR,
            )
            audio_to_save = torch.tensor(encoded_audio).unsqueeze(0)
        except Exception as e:
            print(f"Warning: Watermarking failed: {e}")
            print("Saving audio without watermark...")

    # Save generated audio as WAV (unique filename per session)
    stem = make_stem("generated_simple", session_id)
    output_path = save_audio_with_format(
        audio_to_save,
        TEMP_AUDIO_DIR,
        stem,
        44100,
        "wav",
    )

    # Calculate generation time
    generation_time = time.time() - start_time
    time_str = f"⏱️ Generated in {generation_time:.1f}s"

    return (
        gr.update(value=str(output_path), visible=True),
        gr.update(value=time_str, visible=True),
    )
687
+
688
+
689
  # UI Helper Functions
690
 
691
  def load_speaker_metadata(speaker_id):
 
1857
  # On Zero GPU, don't try to compile
1858
  return session_id, gr.update(), gr.update()
1859
 
1860
+ SIMPLE_CSS = """
1861
+ .simple-container {
1862
+ max-width: 1200px;
1863
+ margin: 0 auto;
1864
+ }
1865
+ .simple-generate-btn {
1866
+ font-size: 1.2rem !important;
1867
+ padding: 1rem 2rem !important;
1868
+ }
1869
+ .simple-output-container {
1870
+ min-height: 200px;
1871
+ display: flex;
1872
+ flex-direction: column;
1873
+ justify-content: center;
1874
+ }
1875
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1876
 
1877
+ with gr.Blocks(title="Echo-TTS", css=LINK_CSS + SIMPLE_CSS, js=JS_CODE) as demo:
1878
+ gr.Markdown("# Echo-TTS")
1879
+ gr.Markdown("*Jordan Darefsky, 2025. See technical details [here](https://jordandarefsky.com/blog/2025/echo/). All audio outputs are subject to non-commercial use [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).*")
1880
 
1881
+ # Session state for per-user file management (shared between tabs)
1882
  session_id_state = gr.State(None)
1883
 
1884
+ # ==================== TABS ====================
1885
+ with gr.Tabs() as main_tabs:
1886
+ # ==================== SIMPLE VIEW TAB ====================
1887
+ with gr.TabItem("🎯 Simple", id="simple_tab"):
1888
+ gr.Markdown("Upload a voice reference (or select a voice from the library), enter text (or select a text preset), and generate!")
1889
+ gr.Markdown("Generate up to 30 seconds of audio. *If the generated voice does not match the reference speaker, enable Speaker KV in the Generation Parameters section.*")
1890
+
1891
+ with gr.Row():
1892
+ # LEFT: Inputs
1893
+ with gr.Column(scale=1):
1894
+ # Voice section - dropdown above audio
1895
+ with gr.Group():
1896
+ simple_audio_preset = gr.Dropdown(
1897
+ choices=["(upload your own or select from dropdown)"] + [f[0] for f in get_audio_prompt_files()],
1898
+ value="(upload your own or select from dropdown)",
1899
+ label="Voice",
1900
+ container=False
1901
+ )
1902
+ simple_audio_input = gr.Audio(
1903
+ sources=["upload", "microphone"],
1904
+ type="filepath",
1905
+ label=None,
1906
+ max_length=600
1907
+ )
1908
+
1909
+ gr.Markdown("---")
1910
+
1911
+ # Text input
1912
+ simple_text_prompt = gr.Textbox(
1913
+ label="Text",
1914
+ info="Enter the text you want the voice to say... or select a text preset below.",
1915
+ value="[S1] One of the cool things about Echo is that it can generate speech that sounds, I don't know, more human maybe? Like, uh, it can actually generate pretty natural disfluencies, well, at least some of the time. Like if you run it for a few different random seeds, a few different settings, there's uh, there's a decent chance that one of them will actually be pretty good. I mean, it's not perfect, obviously,",
1916
+ lines=4
1917
+ )
1918
+
1919
+ # RIGHT: Generate + Output
1920
+ with gr.Column(scale=1):
1921
+ # Generation parameters accordion
1922
+ with gr.Accordion("⚙️ Generation Parameters (optional)", open=True):
1923
+ # Only show independent mode presets in simple view
1924
+ simple_presets = {k: v for k, v in load_sampler_presets().items() if v.get("cfg_mode") == "independent"}
1925
+ with gr.Row():
1926
+ simple_preset = gr.Dropdown(
1927
+ choices=list(simple_presets.keys()),
1928
+ value=list(simple_presets.keys())[0] if simple_presets else None,
1929
+ label="Preset",
1930
+ scale=3,
1931
+ interactive=True
1932
+ )
1933
+ simple_rng_seed = gr.Number(
1934
+ label="Seed",
1935
+ value=0,
1936
+ precision=0,
1937
+ scale=1,
1938
+ min_width=60
1939
+ )
1940
+ simple_num_steps = gr.Number(
1941
+ label="Steps",
1942
+ value=40,
1943
+ precision=0,
1944
+ minimum=5,
1945
+ maximum=80,
1946
+ step=5,
1947
+ scale=1,
1948
+ min_width=60
1949
+ )
1950
+ with gr.Row():
1951
+ simple_speaker_kv_enable = gr.Checkbox(
1952
+ label="Enable Speaker KV",
1953
+ info="Check this if the generated voice does NOT match the reference speaker",
1954
+ value=False,
1955
+ scale=1
1956
+ )
1957
+ simple_speaker_kv_scale = gr.Number(
1958
+ label="KV Scale",
1959
+ info="(Try 1.5, 1.3, ..., 1.1)",
1960
+ value=1.5,
1961
+ step=0.1,
1962
+ visible=False,
1963
+ scale=1
1964
+ )
1965
+
1966
+ simple_generate_btn = gr.Button(
1967
+ "🎙️ Generate Audio",
1968
+ variant="primary",
1969
+ size="lg"
1970
+ )
1971
+ simple_time_display = gr.Markdown("", visible=False)
1972
+ simple_generated_audio = gr.Audio(
1973
+ label="Generated Audio",
1974
+ visible=True,
1975
+ interactive=False
1976
+ )
1977
+
1978
+ # Text presets - full width below
1979
+ with gr.Accordion("📝 Text Presets", open=False):
1980
+ simple_text_presets_table = gr.Dataframe(
1981
+ value=load_text_presets(),
1982
+ headers=["Category", "Words", "Text"],
1983
+ datatype=["str", "str", "str"],
1984
+ row_count=(4, "fixed"),
1985
+ col_count=(3, "fixed"),
1986
  interactive=False,
1987
+ column_widths=["10%", "6%", "84%"],
1988
+ wrap=True
1989
  )
1990
+
1991
+ gr.Markdown("---")
1992
+ gr.Markdown("*💡 For more control over generation parameters, switch to the **Advanced** tab.*")
1993
 
1994
+ # ==================== ADVANCED VIEW TAB ====================
1995
+ with gr.TabItem("⚙️ Advanced", id="advanced_tab"):
1996
+ # Instructions for Simple Mode
1997
+ with gr.Accordion("📖 Quick Start Instructions", open=False):
1998
+ gr.Markdown("""
1999
+ ### Simple Mode (Recommended for Beginners)
2000
+
2001
+ 1. **Pick or upload a voice** - Choose from the voicebank or upload your own audio (up to 2 minutes)
2002
+ 2. **Choose a text prompt preset or enter your own prompt** - What you want the voice to say (the presets are a good guide for format/style)
2003
+ 3. **Select a Sampling preset (optional)** - The default preset "Independent (High Speaker CFG)" is usually good to start
2004
+ 4. **Click Generate Audio** - Wait for the model to generate your audio
2005
+
2006
+ <div class="tip-box">
2007
+
2008
+ 💡 **Tip:** If the generated voice doesn't match the reference speaker at all, enable "Speaker KV Attention Scaling" and click Generate Audio again.
2009
+
2010
+ </div>
2011
+
2012
+ ### Advanced Mode
2013
+
2014
+ Switch to Advanced mode for full control over all generation parameters including CFG scales, sampling steps, truncation, and more.
2015
+
2016
+ ### Other tips
2017
+
2018
+ High CFG settings are recommended but may lead to oversaturation; APG might help with this. Flat settings tend to reduce "impulse" artifacts but might result in worse (blunted/compressed/artifact-y) laughter, breathing, etc. generation.
2019
+
2020
+ Echo will try to fit the entire text-prompt into (<=) 30 seconds of audio. If your prompt is very long, the generated speech may be too quick (this is not an issue for shorter text-prompts). For disfluent, single-speaker speech, we recommend trying the reference text beginning with "[S1] ... explore how we can design" as a starting point.
2021
+ """)
2022
+
2023
+ # Hidden state variables to store paths and selection
2024
+ selected_speaker_state = gr.Textbox(visible=False, value="")
2025
+ speaker_st_path_state = gr.Textbox(visible=False, value="")
2026
+ speaker_audio_path_state = gr.Textbox(visible=False, value="")
2027
+
2028
+ gr.Markdown("# Voice Selection")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2029
 
2030
+ # Dataset selector
2031
+ dataset_selector = gr.Radio(
2032
+ choices=["Custom Audio Panel", "EARS", "VCTK", "Expresso", "HF-Custom"],
2033
+ value="Custom Audio Panel",
2034
+ label="Select Dataset",
2035
+ info="Choose which voicebank to use"
2036
  )
2037
 
2038
+ dataset_license_info = gr.Markdown(
2039
+ "",
2040
+ visible=False
 
 
 
2041
  )
2042
 
2043
+ # Custom Audio Panel UI (visible by default, takes full width)
2044
+ with gr.Row(visible=True) as custom_audio_row:
2045
+ # Optional: Audio prompt library table (only shown if AUDIO_PROMPT_FOLDER is configured)
2046
+ if AUDIO_PROMPT_FOLDER is not None and AUDIO_PROMPT_FOLDER.exists():
2047
+ with gr.Column(scale=1, min_width=200):
2048
+ gr.Markdown("#### Audio Library (favorite examples from voicebank datasets)")
2049
+ audio_prompt_table = gr.Dataframe(
2050
+ value=get_audio_prompt_files(),
2051
+ headers=["Filename"],
2052
+ datatype=["str"],
2053
+ row_count=(10, "dynamic"),
2054
+ col_count=(1, "fixed"),
2055
+ interactive=False,
2056
+ label="Click to select (or upload your own audio file directly on the right)"
2057
+ )
2058
+
2059
+ with gr.Column(scale=2):
2060
+ custom_audio_input = gr.Audio(
2061
+ sources=["upload", "microphone"],
2062
+ type="filepath",
2063
+ label="Speaker Reference Audio (only first two minutes will be used; leave empty for zero speaker conditioning)",
2064
+ max_length=600 # Maximum duration in seconds (10 minutes)
2065
+ )
2066
 
2067
+ with gr.Row(visible=False) as voicebank_row:
2068
+ # Voice selection UI for all voicebank datasets
2069
+
2070
+ # EARS UI (visible by default when voicebank_row is shown)
2071
+ with gr.Column(scale=2, visible=True) as ears_column:
2072
+ gr.Markdown("### 1. Speakers (EARS)")
2073
+ selected_speaker_display = gr.Textbox(
2074
+ value="",
2075
+ label="",
2076
+ show_label=False,
2077
+ interactive=False,
2078
+ visible=False,
2079
+ lines=2,
2080
+ max_lines=2
2081
+ )
2082
+ speaker_search = gr.Textbox(
2083
+ placeholder="Search speakers (by ID, gender, age, ethnicity, language)...",
2084
+ label="",
2085
+ show_label=False,
2086
+ container=False
2087
+ )
2088
+ speakers_table = gr.Dataframe(
2089
+ value=get_speakers_table(),
2090
+ headers=["ID", "G", "Age", "Ethnicity", "Native Lang"],
2091
+ datatype=["str", "str", "str", "str", "str"],
2092
+ row_count=(8, "dynamic"),
2093
+ col_count=(5, "fixed"),
2094
+ interactive=False,
2095
+ label="Click any cell to select",
2096
+ column_widths=["10%", "8%", "15%", "30%", "37%"]
2097
+ )
2098
 
2099
+ # VCTK UI (hidden by default)
2100
+ with gr.Column(scale=2, visible=False) as vctk_column:
2101
+ gr.Markdown("### 1. Speakers (VCTK)")
2102
+ vctk_speaker_display = gr.Textbox(
2103
+ value="",
2104
+ label="",
2105
+ show_label=False,
2106
+ interactive=False,
2107
+ visible=False,
2108
+ lines=2,
2109
+ max_lines=2
2110
+ )
2111
+ vctk_speaker_search = gr.Textbox(
2112
+ placeholder="Search speakers (by ID, gender, age, details)...",
2113
+ label="",
2114
+ show_label=False,
2115
+ container=False
2116
+ )
2117
+ vctk_speakers_table = gr.Dataframe(
2118
+ value=get_vctk_speakers_table(),
2119
+ headers=["ID", "G", "Age", "Details", "Length"],
2120
+ datatype=["str", "str", "str", "str", "str"],
2121
+ row_count=(8, "dynamic"),
2122
+ col_count=(5, "fixed"),
2123
+ interactive=False,
2124
+ label="Click any cell to select",
2125
+ column_widths=["10%", "8%", "12%", "50%", "20%"]
2126
+ )
2127
+
2128
+ # Expresso UI (hidden by default)
2129
+ with gr.Column(scale=2, visible=False) as expresso_column:
2130
+ gr.Markdown("### 1. Voices (Expresso)")
2131
+ expresso_speaker_display = gr.Textbox(
2132
+ value="",
2133
+ label="",
2134
+ show_label=False,
2135
+ interactive=False,
2136
+ visible=False,
2137
+ lines=2,
2138
+ max_lines=2
2139
+ )
2140
+ expresso_speaker_search = gr.Textbox(
2141
+ placeholder="Search voices (by ID, type, speakers, style)...",
2142
+ label="",
2143
+ show_label=False,
2144
+ container=False
2145
+ )
2146
+ expresso_speakers_table = gr.Dataframe(
2147
+ value=get_expresso_speakers_table(),
2148
+ headers=["ID", "Type", "Speakers", "Style", "Length"],
2149
+ datatype=["str", "str", "str", "str", "str"],
2150
+ row_count=(8, "dynamic"),
2151
+ col_count=(5, "fixed"),
2152
+ interactive=False,
2153
+ label="Click any cell to select",
2154
+ column_widths=["35%", "15%", "15%", "15%", "20%"]
2155
+ )
2156
+
2157
+ # HF-Custom UI (hidden by default)
2158
+ with gr.Column(scale=2, visible=False) as hf_custom_column:
2159
+ gr.Markdown("### 1. Voices (HF-Custom)")
2160
+ hf_custom_speaker_display = gr.Textbox(
2161
+ value="",
2162
+ label="",
2163
+ show_label=False,
2164
+ interactive=False,
2165
+ visible=False,
2166
+ lines=2,
2167
+ max_lines=2
2168
+ )
2169
+ hf_custom_speaker_search = gr.Textbox(
2170
+ placeholder="Search voices (by name, dataset, description)...",
2171
+ label="",
2172
+ show_label=False,
2173
+ container=False
2174
+ )
2175
+ hf_custom_speakers_table = gr.Dataframe(
2176
+ value=get_hf_custom_speakers_table(),
2177
+ headers=["Name", "Dataset", "Description", "Length"],
2178
+ datatype=["str", "str", "str", "str"],
2179
+ row_count=(8, "dynamic"),
2180
+ col_count=(4, "fixed"),
2181
+ interactive=False,
2182
+ label="Click any cell to select",
2183
+ column_widths=["15%", "15%", "50%", "20%"]
2184
+ )
2185
+
2186
+ with gr.Column(scale=1, visible=True) as voice_type_column:
2187
+ gr.Markdown("### 2. Voice Type")
2188
+ selected_voice_display = gr.Textbox(
2189
+ value="",
2190
+ label="",
2191
+ show_label=False,
2192
+ interactive=False,
2193
+ visible=False,
2194
+ lines=2,
2195
+ max_lines=2
2196
+ )
2197
+ freeform_table = gr.Dataframe(
2198
+ value=[],
2199
+ headers=["Type", "Length"],
2200
+ datatype=["str", "str"],
2201
+ row_count=(1, "fixed"),
2202
+ col_count=(2, "fixed"),
2203
+ interactive=False,
2204
+ label="Freeform voice",
2205
+ visible=True,
2206
+ column_widths=["60%", "40%"]
2207
+ )
2208
+ gr.Markdown("**Emotions:**")
2209
+ emotions_table = gr.Dataframe(
2210
+ value=[],
2211
+ headers=["Emotion", "Length"],
2212
+ datatype=["str", "str"],
2213
+ row_count=(8, "dynamic"),
2214
+ col_count=(2, "fixed"),
2215
+ interactive=False,
2216
+ visible=True,
2217
+ column_widths=["60%", "40%"]
2218
+ )
2219
+
2220
+ with gr.Column(scale=1):
2221
+ gr.Markdown("### 3. Audio Preview")
2222
+ audio_preview = gr.Audio(label="Voice Sample", type="filepath", interactive=False)
2223
+
2224
+ gr.HTML('<hr class="section-separator">')
2225
+ gr.Markdown("# Text Prompt")
2226
+ with gr.Accordion("Text Presets", open=True):
2227
+ text_presets_table = gr.Dataframe(
2228
+ value=load_text_presets(),
2229
+ headers=["Category", "Words", "Preset Text"],
2230
+ datatype=["str", "str", "str"],
2231
+ row_count=(3, "dynamic"),
2232
+ col_count=(3, "fixed"),
2233
+ interactive=False,
2234
+ column_widths=["12%", "6%", "82%"]
2235
  )
2236
+ text_prompt = gr.Textbox(
2237
+ label="Text Prompt",
2238
+ placeholder="[S1] Enter your text prompt here...",
2239
+ lines=4
2240
+ )
2241
 
2242
+ gr.HTML('<hr class="section-separator">')
2243
+ gr.Markdown("# Generation")
2244
 
2245
+ # Mode selector: Simple or Advanced (outside the accordion, centered and prominent)
2246
  with gr.Row():
 
2247
  with gr.Column(scale=1):
2248
+ pass # Empty column for spacing
2249
+ with gr.Column(scale=2):
2250
+ mode_selector = gr.Radio(
2251
+ choices=["Simple Mode", "Advanced Mode"],
2252
+ value="Simple Mode",
2253
+ label="",
2254
+ info=None,
2255
+ elem_id="component-mode-selector"
2256
+ )
2257
+ with gr.Column(scale=1):
2258
+ pass # Empty column for spacing
2259
+
2260
+ with gr.Accordion("⚙️ Generation Parameters", open=True):
2261
+
2262
+ with gr.Row():
2263
+ presets = load_sampler_presets()
2264
+ preset_keys = list(presets.keys())
2265
+ first_preset = preset_keys[0] if preset_keys else "Custom"
2266
+
2267
+ preset_dropdown = gr.Dropdown(
2268
+ choices=["Custom"] + preset_keys,
2269
+ value=first_preset, # Default to first preset instead of Custom
2270
+ label="Sampler Preset",
2271
+ info="Load preset configurations",
2272
+ scale=2
2273
+ )
2274
+
2275
+ rng_seed = gr.Number(
2276
+ label="RNG Seed",
2277
+ value=0,
2278
+ info="Random seed for starting noise",
2279
+ precision=0,
2280
+ scale=1
2281
+ )
2282
+
2283
+ # Simple mode: Speaker KV checkbox on same row (visible by default)
2284
+ with gr.Column(scale=1, visible=True) as simple_mode_row:
2285
+ speaker_kv_simple_checkbox = gr.Checkbox(
2286
+ label="\"Force Speaker\" (Enable Speaker KV Attention Scaling)",
2287
+ value=False,
2288
+ info="Enable if generation does not match reference voice (otherwise leave off)"
2289
  )
2290
 
2291
+ # Advanced mode: Compile and custom shapes checkboxes (hidden by default)
2292
+ with gr.Column(scale=1, visible=False) as advanced_mode_compile_column:
2293
+ compile_checkbox = gr.Checkbox(
2294
+ label="Compile Model",
2295
+ value=True, # Default to True in simple mode
2296
+ interactive=not IS_ZEROGPU,
2297
+ info="Compile disabled on Zero GPU" if IS_ZEROGPU else "~20-30% faster after initial compilation"
2298
+ )
2299
+ compile_status = gr.Markdown(
2300
+ value="⚠️ Compile disabled on Zero GPU" if IS_ZEROGPU else "",
2301
+ visible=IS_ZEROGPU
2302
+ )
2303
+ use_custom_shapes_checkbox = gr.Checkbox(
2304
+ label="Use Custom Shapes (Advanced)",
2305
+ value=False,
2306
+ info="Override default sequence lengths for text, speaker, and sample"
2307
+ )
 
 
 
 
 
2308
 
2309
+ # Advanced mode controls (hidden by default)
2310
+ with gr.Column(visible=False) as advanced_mode_column:
2311
+ with gr.Row(visible=False) as custom_shapes_row:
2312
+ max_text_byte_length = gr.Textbox(
2313
+ label="Max Text Byte Length (padded)",
2314
+ value="768",
2315
+ info="Maximum text utf-8 byte sequence length (blank -> no padding)",
2316
+ scale=1
2317
+ )
2318
+ max_speaker_latent_length = gr.Textbox(
2319
+ label="Max Speaker Latent Length (padded)",
2320
+ value="2560",
2321
+ info="Maximum (unpatched)speaker latent length (blank -> no padding), default 2560 = ~30s",
2322
+ scale=1
2323
+ )
2324
+ sample_latent_len = gr.Textbox(
2325
+ label="Sample Latent Length",
2326
+ value="640",
2327
+ info="Maximum sample latent length (EXPERIMENTAL!!! ONLY TRAINED WITH 640 BUT SOMEHOW WORKS WITH < 640 TO GENERATE PREFIXES)",
2328
+ scale=1
2329
+ )
2330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2331
 
2332
+ with gr.Row():
2333
+ # Left column: Core Sampling Parameters
2334
+ with gr.Column(scale=1):
2335
+ with gr.Group():
2336
+ gr.HTML("""
2337
+ <div class="preset-inline">
2338
+ <span class="title">Core Sampling Parameters</span><span class="dim">(</span>
2339
+ <a href="javascript:void(0)" class="preset-link" data-fire="core_default">default</a>
2340
+ <span class="dim">)</span>
2341
+ </div>
2342
+ """)
2343
+ core_preset_default = gr.Button("", elem_id="core_default", elem_classes=["proxy-btn"])
2344
+ num_steps = gr.Number(label="Number of Steps", value=40, info="Number of sampling steps (consider 20 - 80) (capped at 80)", precision=0, minimum=1, step=5, maximum=80)
2345
+
2346
+ cfg_mode = gr.Radio(
2347
+ choices=[
2348
+ "independent",
2349
+ "apg-independent",
2350
+ "alternating",
2351
+ "joint-unconditional"
2352
+ ],
2353
+ value="independent",
2354
+ label="CFG Mode",
2355
+ info="Independent (3 NFE), Adaptive Projected Guidance (3 NFE, see https://arxiv.org/abs/2410.02416), Alternating (2 NFE), Joint-Unconditional (2 NFE)"
2356
+ )
2357
+
2358
+ with gr.Group():
2359
+ gr.HTML("""
2360
+ <div class="preset-inline">
2361
+ <span class="title">CFG Guidance</span><span class="dim">(</span>
2362
+ <a href="javascript:void(0)" class="preset-link" data-fire="cfg_default">default</a>
2363
+ <span class="dim">,</span>
2364
+ <a href="javascript:void(0)" class="preset-link" data-fire="cfg_higher">higher speaker</a>
2365
+ <span class="dim">,</span>
2366
+ <a href="javascript:void(0)" class="preset-link" data-fire="cfg_large">large guidances(works with apg)</a>
2367
+ <span class="dim">)</span>
2368
+ </div>
2369
+ """)
2370
+ cfg_preset_default = gr.Button("", elem_id="cfg_default", elem_classes=["proxy-btn"])
2371
+ cfg_preset_higher_speaker = gr.Button("", elem_id="cfg_higher", elem_classes=["proxy-btn"])
2372
+ cfg_preset_large_guidances = gr.Button("", elem_id="cfg_large", elem_classes=["proxy-btn"])
2373
+ with gr.Row():
2374
+ cfg_scale_text = gr.Number(label="Text CFG Scale", value=3.0, info="Guidance strength for text", minimum=0, step=0.5)
2375
+ cfg_scale_speaker = gr.Number(label="Speaker CFG Scale", value=5.0, info="Guidance strength for speaker", minimum=0, step=0.5)
2376
+
2377
+ with gr.Row():
2378
+ cfg_min_t = gr.Number(label="CFG Min t", value=0.5, info="(0-1), CFG applied when t >= val", minimum=0, maximum=1, step=0.05)
2379
+ cfg_max_t = gr.Number(label="CFG Max t", value=1.0, info="(0-1), CFG applied when t <= val", minimum=0, maximum=1, step=0.05)
2380
+
2381
+ # Right column: Speaker KV, Truncation + APG
2382
+ with gr.Column(scale=1):
2383
+ with gr.Group():
2384
+ gr.HTML("""
2385
+ <div class="preset-inline">
2386
+ <span class="title">Speaker KV Attention Scaling</span><span class="dim">(</span>
2387
+ <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_enable">enable if generation does not match reference</a>
2388
+ <span class="dim">,</span>
2389
+ <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_off">off</a>
2390
+ <span class="dim">)</span>
2391
+ </div>
2392
+ """)
2393
+ spk_kv_preset_enable = gr.Button("", elem_id="spk_kv_enable", elem_classes=["proxy-btn"])
2394
+ spk_kv_preset_off = gr.Button("", elem_id="spk_kv_off", elem_classes=["proxy-btn"])
2395
+ speaker_k_enable = gr.Checkbox(label="Enable Speaker KV Scaling", value=False, info="Scale speaker attention key-values; useful when the model-generated audio does not at all match the reference audio (i.e. ignores speaker-reference)")
2396
+
2397
+ with gr.Row(visible=False) as speaker_k_row:
2398
+ speaker_k_scale = gr.Number(label="KV Scale", value=1.5, info="Scale factor", minimum=0, step=0.1)
2399
+ speaker_k_min_t = gr.Number(label="KV Min t", value=0.9, info="(0-1), scale applied from steps t=1. to val", minimum=0, maximum=1, step=0.05)
2400
+ speaker_k_max_layers = gr.Number(label="Max Layers", value=24, info="(0-24), scale applied in first N layers", precision=0, minimum=0, maximum=24)
2401
+
2402
+ with gr.Group():
2403
+ gr.HTML("""
2404
+ <div class="preset-inline">
2405
+ <span class="title">Truncation &amp; Temporal Rescaling</span><span class="dim">(</span>
2406
+ <a href="javascript:void(0)" class="preset-link" data-fire="trunc_flat">flat</a>
2407
+ <span class="dim">,</span>
2408
+ <a href="javascript:void(0)" class="preset-link" data-fire="trunc_sharp">sharp</a>
2409
+ <span class="dim">,</span>
2410
+ <a href="javascript:void(0)" class="preset-link" data-fire="trunc_baseline">baseline(sharp)</a>
2411
+ <span class="dim">)</span>
2412
+ </div>
2413
+ """)
2414
+ trunc_preset_flat = gr.Button("", elem_id="trunc_flat", elem_classes=["proxy-btn"])
2415
+ trunc_preset_sharp = gr.Button("", elem_id="trunc_sharp", elem_classes=["proxy-btn"])
2416
+ trunc_preset_baseline = gr.Button("", elem_id="trunc_baseline", elem_classes=["proxy-btn"])
2417
+ with gr.Row():
2418
+ truncation_factor = gr.Number(label="Truncation Factor", value=0.8, info="Multiply initial noise (<1 helps artifacts)", minimum=0, step=0.05)
2419
+ rescale_k = gr.Number(label="Rescale k", value=1.2, info="<1=sharpen, >1=flatten, 1=off", minimum=0, step=0.05)
2420
+ rescale_sigma = gr.Number(label="Rescale σ", value=3.0, info="Sigma parameter", minimum=0, step=0.1)
2421
+
2422
+ with gr.Group(visible=False) as apg_row:
2423
+ gr.HTML("""
2424
+ <div class="preset-inline">
2425
+ <span class="title">APG Parameters</span><span class="dim">(</span>
2426
+ <a href="javascript:void(0)" class="preset-link" data-fire="apg_default">default</a>
2427
+ <span class="dim">,</span>
2428
+ <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_momentum">no momentum</a>
2429
+ <span class="dim">,</span>
2430
+ <a href="javascript:void(0)" class="preset-link" data-fire="apg_norms">norms</a>
2431
+ <span class="dim">,</span>
2432
+ <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_eta">no eta</a>
2433
+ <span class="dim">)</span>
2434
+ </div>
2435
+ """)
2436
+ apg_preset_default = gr.Button("", elem_id="apg_default", elem_classes=["proxy-btn"])
2437
+ apg_preset_no_momentum = gr.Button("", elem_id="apg_no_momentum", elem_classes=["proxy-btn"])
2438
+ apg_preset_norms = gr.Button("", elem_id="apg_norms", elem_classes=["proxy-btn"])
2439
+ apg_preset_no_eta = gr.Button("", elem_id="apg_no_eta", elem_classes=["proxy-btn"])
2440
+ with gr.Row():
2441
+ apg_eta_text = gr.Number(label="APG η (text)", value=0.5, info="Eta for text projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
2442
+ apg_eta_speaker = gr.Number(label="APG η (speaker)", value=0.5, info="Eta for speaker projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
2443
+
2444
+ with gr.Row() as apg_row2:
2445
+ apg_momentum_text = gr.Number(label="APG Momentum (text)", value=-0.25, info="Text momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
2446
+ apg_momentum_speaker = gr.Number(label="APG Momentum (speaker)", value=-0.25, info="Speaker momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
2447
+ with gr.Row():
2448
+ apg_norm_text = gr.Textbox(label="APG Norm (text)", value="", info="Text norm clip (leave blank to disable, can try 7.5, 15.0)")
2449
+ apg_norm_speaker = gr.Textbox(label="APG Norm (speaker)", value="", info="Speaker norm clip (leave blank to disable, can try 7.5, 15.0)")
2450
+ # End of advanced_mode_column
2451
+
2452
+ with gr.Row(equal_height=True):
2453
+ audio_format = gr.Radio(
2454
+ choices=["wav", "mp3"],
2455
+ value="wav",
2456
+ label="Format",
2457
+ scale=1,
2458
+ min_width=90
2459
+ )
2460
+ generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", scale=10)
2461
+ with gr.Column(scale=1):
2462
+ show_original_audio = gr.Checkbox(
2463
+ label="Re-display original audio (full 2-minute cropped mono)",
2464
+ value=False
2465
+ )
2466
+ reconstruct_first_30_seconds = gr.Checkbox(
2467
+ label="Show Autoencoder Reconstruction (only first 30s of reference)",
2468
+ value=False
2469
+ )
2470
+
2471
+ gr.HTML('<hr class="section-separator">')
2472
+ with gr.Accordion("Generated Audio", open=True, visible=True) as generated_section:
2473
+ generation_time_display = gr.Markdown("", visible=False)
2474
+ with gr.Group(elem_classes=["generated-audio-player"]):
2475
+ generated_audio = gr.Audio(label="Generated Audio", visible=True)
2476
+ text_prompt_display = gr.Markdown("", visible=False)
2477
+
2478
+ gr.Markdown("---")
2479
+ reference_audio_header = gr.Markdown("#### Reference Audio", visible=False)
2480
+
2481
+ with gr.Accordion("Original Audio (2 min Cropped Mono)", open=False, visible=False) as original_accordion:
2482
+ original_audio = gr.Audio(label="Original Reference Audio (2 min)", visible=True)
2483
+
2484
+ with gr.Accordion("Autoencoder Reconstruction of First 30s of Reference", open=False, visible=False) as reference_accordion:
2485
+ reference_audio = gr.Audio(label="Decoded Reference Audio (30s)", visible=True)
2486
+
2487
+ # End of Advanced TabItem
2488
+ # End of Tabs
2489
 
2490
  # Event handlers
2491
+
2492
+ # Simple View - Generate button handler
2493
+ simple_generate_btn.click(
2494
+ generate_audio_simple,
2495
+ inputs=[
2496
+ simple_text_prompt,
2497
+ simple_audio_input,
2498
+ simple_preset,
2499
+ simple_rng_seed,
2500
+ simple_num_steps,
2501
+ simple_speaker_kv_enable,
2502
+ simple_speaker_kv_scale,
2503
+ session_id_state,
2504
+ ],
2505
+ outputs=[simple_generated_audio, simple_time_display]
2506
+ )
2507
+
2508
+ # Simple View - Speaker KV checkbox toggle
2509
+ simple_speaker_kv_enable.change(
2510
+ lambda enabled: gr.update(visible=enabled),
2511
+ inputs=[simple_speaker_kv_enable],
2512
+ outputs=[simple_speaker_kv_scale]
2513
+ )
2514
+
2515
+ # Simple View - Preset dropdown handler
2516
def apply_simple_preset(preset_name):
    """Push a sampler preset's settings into the Simple-view controls.

    Args:
        preset_name: Key of the preset chosen in the Simple-view dropdown.

    Returns:
        A list of three Gradio updates, in the order the
        ``simple_preset.change`` wiring lists its outputs: step count,
        Speaker KV checkbox value, and KV-scale field visibility.  An empty
        or unknown preset name yields three no-op updates.
    """
    untouched = [gr.update()] * 3
    if not preset_name:
        return untouched

    preset = load_sampler_presets().get(preset_name)
    if preset is None:
        return untouched

    steps = int(preset.get("num_steps", 40))

    # num_steps is coerced with int(), which suggests preset values may be
    # stored as strings (TODO confirm against the preset file).  A string flag
    # such as "false" is truthy, so normalise it to a real bool before using
    # it as a checkbox value / visibility flag.
    speaker_kv = preset.get("speaker_k_enable", False)
    if isinstance(speaker_kv, str):
        speaker_kv = speaker_kv.strip().lower() in ("true", "1", "yes", "on")
    else:
        speaker_kv = bool(speaker_kv)

    return [
        gr.update(value=steps),
        gr.update(value=speaker_kv),
        # The scale field is only shown while Speaker KV is enabled.
        gr.update(visible=speaker_kv),
    ]
2530
+
2531
+ simple_preset.change(
2532
+ apply_simple_preset,
2533
+ inputs=[simple_preset],
2534
+ outputs=[simple_num_steps, simple_speaker_kv_enable, simple_speaker_kv_scale]
2535
+ )
2536
+
2537
+ # Simple View - Audio preset dropdown handler
2538
def select_simple_audio_preset(preset_name):
    """Load the voice picked in the Simple-view dropdown into the audio input.

    Choosing the placeholder entry (or nothing) clears the audio input.  A
    preset name is resolved relative to ``AUDIO_PROMPT_FOLDER``; when that
    folder is not configured or the file is missing, the audio input is left
    unchanged.
    """
    placeholder = "(upload your own or select from dropdown)"
    if not preset_name or preset_name == placeholder:
        # Nothing selected: clear the audio input.
        return gr.update(value=None)

    if AUDIO_PROMPT_FOLDER is None:
        return gr.update()

    candidate = AUDIO_PROMPT_FOLDER / preset_name
    if not candidate.exists():
        return gr.update()
    return gr.update(value=str(candidate))
2546
+
2547
+ simple_audio_preset.change(
2548
+ select_simple_audio_preset,
2549
+ inputs=[simple_audio_preset],
2550
+ outputs=[simple_audio_input]
2551
+ )
2552
+
2553
+ # Simple View - Text preset table selection handler
2554
def select_simple_text_preset(evt: gr.SelectData):
    """Copy the text of the clicked preset row into the Simple-view textbox.

    The first element of ``evt.index`` is used as the row number into the
    list returned by ``load_text_presets()``; the third field of that row is
    written to the textbox.  A row number at or beyond the end of the list
    produces a no-op update.
    """
    rows = load_text_presets()
    clicked_row = evt.index[0]
    if clicked_row >= len(rows):
        return gr.update()
    return gr.update(value=rows[clicked_row][2])
2559
+
2560
+ simple_text_presets_table.select(
2561
+ select_simple_text_preset,
2562
+ outputs=[simple_text_prompt]
2563
+ )
2564
+
2565
+ # Simple View - Reset audio preset dropdown when audio is cleared
2566
+ simple_audio_input.clear(
2567
+ lambda: gr.update(value="(upload your own or select from dropdown)"),
2568
+ outputs=[simple_audio_preset]
2569
+ )
2570
+
2571
+ # Advanced View Event handlers
2572
  # Custom Audio Panel - handle audio change to update speaker_audio_path_state
2573
  custom_audio_input.change(
2574
  lambda audio: gr.update(value=audio if audio else ""),