Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -83,7 +83,7 @@ def load_examples(csv_path):
|
|
| 83 |
|
| 84 |
return examples
|
| 85 |
|
| 86 |
-
# Client wrapper for main generation - FIXED
|
| 87 |
def run_generation_pipeline_client(
|
| 88 |
raw_text,
|
| 89 |
audio_prompt,
|
|
@@ -94,10 +94,18 @@ def run_generation_pipeline_client(
|
|
| 94 |
use_chained_longform
|
| 95 |
):
|
| 96 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
# Call the backend API
|
| 98 |
result = client.predict(
|
| 99 |
raw_text,
|
| 100 |
-
|
| 101 |
num_candidates,
|
| 102 |
cfg_scale,
|
| 103 |
top_k,
|
|
@@ -126,7 +134,7 @@ def run_generation_pipeline_client(
|
|
| 126 |
except Exception as e:
|
| 127 |
return None, f"Status: Connection error: {str(e)}"
|
| 128 |
|
| 129 |
-
# Client wrapper for duration-aware generation - FIXED
|
| 130 |
def run_duration_generation_pipeline_client(
|
| 131 |
raw_text,
|
| 132 |
audio_prompt,
|
|
@@ -140,10 +148,18 @@ def run_duration_generation_pipeline_client(
|
|
| 140 |
chars_per_second
|
| 141 |
):
|
| 142 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
# Call the backend API
|
| 144 |
result = client.predict(
|
| 145 |
raw_text,
|
| 146 |
-
|
| 147 |
num_candidates,
|
| 148 |
cfg_scale,
|
| 149 |
top_k,
|
|
@@ -427,7 +443,7 @@ with gr.Blocks(theme="Respair/Shiki@9.1.0", css=css) as demo:
|
|
| 427 |
Both checkpoints have been fine-tuned on a subset of the dataset with only speaker tags. This allows us to generate high-quality samples without relying on audio prompts or dealing with random speaker attributes, but at the cost of tanking the zero-shot faithfulness of the model.
|
| 428 |
</p>
|
| 429 |
|
| 430 |
-
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
| 431 |
Takane also comes with an Anti-Hallucination Algorithm (AHA) that generates a few candidates in parallel and automatically returns the best one at the cost of introducing a small overhead.
|
| 432 |
If you need the fastest possible response time, feel free to enable Turbo mode. It disables AHA and tweaks the parameters internally to produce samples in as little as 2-3 seconds.
|
| 433 |
</p>
|
|
|
|
| 83 |
|
| 84 |
return examples
|
| 85 |
|
| 86 |
+
# Client wrapper for main generation - FIXED for audio handling
|
| 87 |
def run_generation_pipeline_client(
|
| 88 |
raw_text,
|
| 89 |
audio_prompt,
|
|
|
|
| 94 |
use_chained_longform
|
| 95 |
):
|
| 96 |
try:
|
| 97 |
+
# Convert audio prompt to serializable format
|
| 98 |
+
if audio_prompt is not None:
|
| 99 |
+
sample_rate, audio_data = audio_prompt
|
| 100 |
+
# Convert numpy array to list for JSON serialization
|
| 101 |
+
audio_prompt_serializable = (sample_rate, audio_data.tolist())
|
| 102 |
+
else:
|
| 103 |
+
audio_prompt_serializable = None
|
| 104 |
+
|
| 105 |
# Call the backend API
|
| 106 |
result = client.predict(
|
| 107 |
raw_text,
|
| 108 |
+
audio_prompt_serializable,
|
| 109 |
num_candidates,
|
| 110 |
cfg_scale,
|
| 111 |
top_k,
|
|
|
|
| 134 |
except Exception as e:
|
| 135 |
return None, f"Status: Connection error: {str(e)}"
|
| 136 |
|
| 137 |
+
# Client wrapper for duration-aware generation - FIXED for audio handling
|
| 138 |
def run_duration_generation_pipeline_client(
|
| 139 |
raw_text,
|
| 140 |
audio_prompt,
|
|
|
|
| 148 |
chars_per_second
|
| 149 |
):
|
| 150 |
try:
|
| 151 |
+
# Convert audio prompt to serializable format
|
| 152 |
+
if audio_prompt is not None:
|
| 153 |
+
sample_rate, audio_data = audio_prompt
|
| 154 |
+
# Convert numpy array to list for JSON serialization
|
| 155 |
+
audio_prompt_serializable = (sample_rate, audio_data.tolist())
|
| 156 |
+
else:
|
| 157 |
+
audio_prompt_serializable = None
|
| 158 |
+
|
| 159 |
# Call the backend API
|
| 160 |
result = client.predict(
|
| 161 |
raw_text,
|
| 162 |
+
audio_prompt_serializable,
|
| 163 |
num_candidates,
|
| 164 |
cfg_scale,
|
| 165 |
top_k,
|
|
|
|
| 443 |
Both checkpoints have been fine-tuned on a subset of the dataset with only speaker tags. This allows us to generate high-quality samples without relying on audio prompts or dealing with random speaker attributes, but at the cost of tanking the zero-shot faithfulness of the model.
|
| 444 |
</p>
|
| 445 |
|
| 446 |
+
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; margin-bottom: 20px; font-size: 16px;">
|
| 447 |
Takane also comes with an Anti-Hallucination Algorithm (AHA) that generates a few candidates in parallel and automatically returns the best one at the cost of introducing a small overhead.
|
| 448 |
If you need the fastest possible response time, feel free to enable Turbo mode. It disables AHA and tweaks the parameters internally to produce samples in as little as 2-3 seconds.
|
| 449 |
</p>
|