# voicekit / app.py
# jjin6573's picture
# Upload folder using huggingface_hub
# 7ae2c28 verified
"""
VoiceKit - MCP Server for Voice Analysis
6 MCP tools for voice processing (all accept base64 audio):
- Embedding extraction, voice comparison, acoustic analysis
- Speech-to-text, voice isolation, similarity analysis
MCP Endpoint: https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse
"""
import gradio as gr
import base64
import os
import json
import tempfile
import math
import re
# Keep Gradio's scratch files in a "gradio_temp" folder under the current
# working directory. The same path is exported through the GRADIO_TEMP_DIR
# environment variable and installed as the default directory for `tempfile`.
GRADIO_TEMP_DIR = os.path.join(os.getcwd(), "gradio_temp")
os.makedirs(GRADIO_TEMP_DIR, exist_ok=True)
os.environ['GRADIO_TEMP_DIR'] = tempfile.tempdir = GRADIO_TEMP_DIR
# Modal connection (requires MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets)
# Start from the "backend unavailable" state; it is only upgraded once the
# import, the class lookup and the instantiation below have all succeeded.
modal_available = False
analyzer = None
try:
    import modal

    AudioAnalyzer = modal.Cls.from_name("voice-semantle", "AudioAnalyzer")
    analyzer = AudioAnalyzer()
    print("Modal connected!")
    modal_available = True
except Exception as exc:
    # Any failure (missing package, failed lookup, ...) leaves the app running
    # without a backend; the tool functions report this through error JSON.
    modal_available = False
    analyzer = None
    print(f"Modal not available: {exc}")
# Load README.md and convert to HTML
def load_readme_as_html():
    """Load README.md from the working directory and convert it to HTML.

    This is a small regex-based converter, not a full markdown parser. It
    handles YAML front matter (removed), fenced code blocks, ATX headers
    (#, ##, ###), images (<img> tags and ![alt](src) syntax), inline code,
    bold, links, pipe tables, "- " lists and plain paragraphs.

    Fenced code blocks are HTML-escaped and set aside behind placeholders
    before any other rule runs, so their content is never reinterpreted as
    headers, bold text, links, etc.

    Returns:
        The HTML as a string with every "{" and "}" doubled (the original
        comment gives the reason as f-string compatibility; the consumer is
        not visible in this part of the file). If anything goes wrong, a
        single <p> element describing the error is returned instead.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    hf_base_url = "https://huggingface.co/spaces/MCP-1st-Birthday/voicekit/resolve/main"

    def image_tag(src):
        # Relative paths are resolved against the Space's raw-file URL.
        if not src.startswith('http'):
            src = f"{hf_base_url}/{src}"
        return f'<img src="{src}" style="max-width:100%; height:auto; border-radius:8px; margin:12px 0;">'

    try:
        with open("README.md", "r", encoding="utf-8") as f:
            content = f.read()

        # Remove YAML front matter
        markup = re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL)

        # Code blocks: escape the content and park it behind a placeholder.
        # The placeholder is an HTML comment, so it starts with "<" (the
        # paragraph pass leaves it alone) and contains nothing the inline
        # rules below could match.
        code_blocks = []

        def stash_code_block(match):
            code = escape(match.group(2).strip(), quote=False)
            code_blocks.append(f'<pre><code>{code}</code></pre>')
            return f'<!--CODEBLOCK{len(code_blocks) - 1}-->'

        markup = re.sub(r'```(\w*)\n(.*?)```', stash_code_block, markup, flags=re.DOTALL)

        # Headers (most specific first so "###" is not consumed by "#")
        markup = re.sub(r'^### (.+)$', r'<h3>\1</h3>', markup, flags=re.MULTILINE)
        markup = re.sub(r'^## (.+)$', r'<h2>\1</h2>', markup, flags=re.MULTILINE)
        markup = re.sub(r'^# (.+)$', r'<h1>\1</h1>', markup, flags=re.MULTILINE)

        # Images: existing <img> tags, then markdown ![alt](src "title").
        # Markdown images must be handled before the link rule, which would
        # otherwise turn them into "!<a ...>".
        markup = re.sub(r'<img src="([^"]+)"[^>]*>', lambda m: image_tag(m.group(1)), markup)
        markup = re.sub(r'!\[[^\]]*\]\(([^)\s]+)[^)]*\)', lambda m: image_tag(m.group(1)), markup)

        # Inline code
        markup = re.sub(r'`([^`]+)`', r'<code>\1</code>', markup)
        # Bold
        markup = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', markup)
        # Links
        markup = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2" target="_blank">\1</a>', markup)

        # Tables: consecutive lines starting with "|" form one table; the
        # first row becomes the header and the |---|---| separator is dropped.
        in_table = False
        table_html = []
        new_lines = []
        for line in markup.split('\n'):
            if '|' in line and line.strip().startswith('|'):
                if not in_table:
                    in_table = True
                    table_html = ['<table>']
                if re.match(r'^\|[\s\-:|]+\|$', line.strip()):
                    continue
                cells = [c.strip() for c in line.strip().split('|')[1:-1]]
                if len(table_html) == 1:
                    table_html.append('<thead><tr>')
                    table_html.extend(f'<th>{cell}</th>' for cell in cells)
                    table_html.append('</tr></thead><tbody>')
                else:
                    table_html.append('<tr>')
                    table_html.extend(f'<td>{cell}</td>' for cell in cells)
                    table_html.append('</tr>')
            else:
                if in_table:
                    table_html.append('</tbody></table>')
                    new_lines.append(''.join(table_html))
                    table_html = []
                    in_table = False
                new_lines.append(line)
        if in_table:
            table_html.append('</tbody></table>')
            new_lines.append(''.join(table_html))
        markup = '\n'.join(new_lines)

        # Lists
        markup = re.sub(r'^- (.+)$', r'<li>\1</li>', markup, flags=re.MULTILINE)
        markup = re.sub(r'(<li>.*</li>\n?)+', r'<ul>\g<0></ul>', markup)

        # Paragraphs: wrap every non-empty line that is not already markup.
        result = []
        for line in markup.split('\n'):
            stripped = line.strip()
            if stripped and not stripped.startswith('<') and not stripped.startswith('```'):
                result.append(f'<p>{stripped}</p>')
            else:
                result.append(line)
        final_html = '\n'.join(result)

        # Put the (already escaped) code blocks back, newlines intact.
        final_html = re.sub(
            r'<!--CODEBLOCK(\d+)-->',
            lambda m: code_blocks[int(m.group(1))],
            final_html,
        )

        # Escape curly braces for f-string compatibility
        final_html = final_html.replace('{', '{{').replace('}', '}}')
        return final_html
    except Exception as e:
        # Best effort: the page should still render when the README is
        # missing or malformed.
        return f"<p>Error loading README: {escape(str(e), quote=False)}</p>"


readme_html = load_readme_as_html()
def file_to_base64(file_path: str) -> str:
    """Return the base64 text encoding of the file at *file_path*.

    A falsy path (empty string or None) yields an empty string instead of
    touching the file system.
    """
    if file_path:
        with open(file_path, "rb") as handle:
            raw_bytes = handle.read()
        return base64.b64encode(raw_bytes).decode()
    return ""
# ============================================================================
# MCP Tools (all accept base64 directly)
# ============================================================================
def extract_embedding(audio_base64: str) -> str:
    """
    Extract voice embedding using Wav2Vec2.
    Returns a 768-dimensional vector representing voice characteristics.
    Useful for voice comparison, speaker identification, etc.

    Args:
        audio_base64: Audio file as base64 encoded string

    Returns:
        JSON string with the backend result. When the result carries an
        "embedding", the full vector is replaced by embedding_preview (first
        5 values followed by "...") and embedding_length. On failure the JSON
        is {"error": message}.
    """
    if not modal_available:
        return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
    if not audio_base64:
        return json.dumps({"error": "No audio provided"})

    try:
        result = analyzer.extract_embedding.remote(audio_base64)
        if "embedding" in result:
            # Swap the full vector for a short preview plus its length.
            vector = result.pop("embedding")
            result["embedding_preview"] = vector[:5] + ["..."]
            result["embedding_length"] = len(vector)
        payload = json.dumps(result, ensure_ascii=False, indent=2)
    except Exception as exc:
        payload = json.dumps({"error": str(exc)})
    return payload
def match_voice(audio1_base64: str, audio2_base64: str) -> str:
    """
    Compare similarity between two voices.
    Extracts Wav2Vec2 embeddings and calculates cosine similarity.
    Useful for checking if the same person spoke with similar tone.

    Args:
        audio1_base64: First audio as base64 encoded string
        audio2_base64: Second audio as base64 encoded string

    Returns:
        JSON string with similarity (0-1) and tone_score (0-100) as reported
        by the backend, or {"error": message} on failure.
    """
    if not modal_available:
        return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
    if not (audio1_base64 and audio2_base64):
        return json.dumps({"error": "Both audio files required"})

    try:
        comparison = analyzer.compare_voices.remote(audio1_base64, audio2_base64)
        payload = json.dumps(comparison, ensure_ascii=False, indent=2)
    except Exception as exc:
        payload = json.dumps({"error": str(exc)})
    return payload
def analyze_acoustics(audio_base64: str) -> str:
    """
    Analyze acoustic features of audio.
    Extracts pitch, energy, rhythm, tempo, and spectral characteristics.
    Useful for understanding voice expressiveness and characteristics.

    Args:
        audio_base64: Audio file as base64 encoded string

    Returns:
        JSON string with pitch, energy, rhythm, tempo, spectral information
        as reported by the backend, or {"error": message} on failure.
    """
    if not modal_available:
        return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
    if not audio_base64:
        return json.dumps({"error": "No audio provided"})

    try:
        features = analyzer.analyze_acoustic_features.remote(audio_base64)
        payload = json.dumps(features, ensure_ascii=False, indent=2)
    except Exception as exc:
        payload = json.dumps({"error": str(exc)})
    return payload
def transcribe_audio(audio_base64: str, language: str = "en") -> str:
    """
    Convert audio to text (Speech-to-Text).
    Uses ElevenLabs Scribe v1 model for high-quality speech recognition.
    Supports various languages.

    Args:
        audio_base64: Audio file as base64 encoded string
        language: Language code (e.g., "en", "ko", "ja"). Default is "en"

    Returns:
        JSON string with text, language, model as reported by the backend,
        or {"error": message} on failure.
    """
    if not modal_available:
        return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
    if not audio_base64:
        return json.dumps({"error": "No audio provided"})

    try:
        transcription = analyzer.transcribe_audio.remote(audio_base64, language)
        payload = json.dumps(transcription, ensure_ascii=False, indent=2)
    except Exception as exc:
        payload = json.dumps({"error": str(exc)})
    return payload
def isolate_voice(audio_base64: str) -> str:
    """
    Remove background music (BGM) and extract voice only.
    Uses ElevenLabs Voice Isolator to remove music, noise, etc.
    Useful for memes, songs, and other audio with background sounds.

    Args:
        audio_base64: Audio file as base64 encoded string

    Returns:
        JSON string with isolated_audio_base64 and metadata (bgm_detected,
        sizes, duration) as reported by the backend, or {"error": message}
        on failure.
    """
    if not modal_available:
        return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
    if not audio_base64:
        return json.dumps({"error": "No audio provided"})

    try:
        isolated = analyzer.isolate_voice.remote(audio_base64)
        payload = json.dumps(isolated, ensure_ascii=False, indent=2)
    except Exception as exc:
        payload = json.dumps({"error": str(exc)})
    return payload
def grade_voice(
    user_audio_base64: str,
    reference_audio_base64: str,
    reference_text: str = "",
    category: str = "meme"
) -> str:
    """
    Comprehensively compare and analyze user voice with reference voice.
    Evaluates with 5 metrics:
    - pronunciation: Pronunciation accuracy (STT-based)
    - tone: Voice timbre similarity (Wav2Vec2 embedding)
    - pitch: Pitch matching
    - rhythm: Rhythm sense
    - energy: Energy expressiveness

    Args:
        user_audio_base64: User audio as base64 encoded string
        reference_audio_base64: Reference audio as base64 encoded string
        reference_text: Reference text (optional, enables pronunciation scoring)
        category: Category (meme, song, movie) - determines weights

    Returns:
        JSON string with the simplified keys pitch, rhythm, energy,
        pronunciation, transcript, overall and user_text, or
        {"error": message} on failure.
    """
    if not modal_available:
        return json.dumps({"error": "Modal not available. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in HF Secrets."})
    if not (user_audio_base64 and reference_audio_base64):
        return json.dumps({"error": "Both user and reference audio required"})

    try:
        result = analyzer.analyze_audio.remote(
            user_audio_base64=user_audio_base64,
            reference_audio_base64=reference_audio_base64,
            # An empty reference text is passed on as None.
            reference_text=reference_text or None,
            challenge_id="mcp_analysis",
            category=category,
        )
        # Simplify output for backend/API use: five metric scores copied
        # from result["metrics"], then the overall score and transcript text.
        metrics = result.get("metrics", {})
        metric_keys = ("pitch", "rhythm", "energy", "pronunciation", "transcript")
        simple_result = {key: metrics.get(key, 0) for key in metric_keys}
        simple_result["overall"] = result.get("overall_score", 0)
        simple_result["user_text"] = result.get("user_text", "")
        payload = json.dumps(simple_result, ensure_ascii=False, indent=2)
    except Exception as exc:
        payload = json.dumps({"error": str(exc)})
    return payload
# ============================================================================
# Demo Functions for UI
# ============================================================================
def demo_acoustic_analysis(audio_file):
    """Acoustic Analysis - Analyze pitch, energy, rhythm, tempo.

    Args:
        audio_file: Path of the uploaded audio file; a falsy value yields
            the empty-state card.

    Returns:
        HTML string: the radar-chart visualization, or a red error card when
        the backend reports an error or the result cannot be rendered.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not audio_file:
        return create_acoustic_empty()

    audio_b64 = file_to_base64(audio_file)
    result_json = analyze_acoustics(audio_b64)
    try:
        result = json.loads(result_json)
        if "error" in result:
            # Error text comes from the backend / exceptions and may contain
            # markup characters, so escape it before embedding it in HTML.
            error_text = escape(str(result.get("error", "Unknown error")))
            return f'''<div style="color: #ef4444; padding: 20px; background: #fee; border-radius: 12px; border: 1px solid #fca5a5;">
<strong>Error in result:</strong><br>{error_text}
</div>'''
        return create_acoustic_visualization(result)
    except Exception as e:
        problem = escape(str(e))
        raw_excerpt = escape(result_json[:500])
        return f'''<div style="color: #ef4444; padding: 20px; background: #fee; border-radius: 12px; border: 1px solid #fca5a5;">
<strong>Parsing Error:</strong> {problem}<br><br>
<strong>Raw Result (first 500 chars):</strong><br>
<code style="display: block; padding: 10px; background: white; border-radius: 4px; overflow-x: auto; font-size: 12px;">{raw_excerpt}</code>
</div>'''
def demo_transcribe_audio(audio_file, language):
    """Audio Transcription.

    Args:
        audio_file: Path of the uploaded audio file; a falsy value yields
            the empty-state card.
        language: Language code forwarded to transcribe_audio.

    Returns:
        HTML string showing the transcribed text, or the empty-state card
        when there is no input, the backend reports an error, or the result
        cannot be processed.
    """
    if not audio_file:
        return create_transcription_empty()

    audio_b64 = file_to_base64(audio_file)
    result_json = transcribe_audio(audio_b64, language)
    try:
        result = json.loads(result_json)
        if "error" in result:
            return create_transcription_empty()
        text = result.get("text", "")
        return create_transcription_visualization(text)
    except Exception:
        # Best-effort UI fallback. `except Exception` (instead of a bare
        # `except:`) lets KeyboardInterrupt/SystemExit propagate.
        return create_transcription_empty()
def demo_clean_extraction(audio_file):
    """Clean Audio Extraction - returns audio file only.

    Args:
        audio_file: Path of the uploaded audio file.

    Returns:
        Path of a newly written ".wav" temporary file holding the isolated
        voice, or None when there is no input, the backend reports an error,
        or the result cannot be decoded. The temporary file is created with
        delete=False and is not removed by this function.
    """
    if not audio_file:
        return None

    audio_b64 = file_to_base64(audio_file)
    result_json = isolate_voice(audio_b64)
    try:
        result = json.loads(result_json)
        if "error" in result:
            return None
        # Convert isolated audio base64 back to file
        isolated_audio_bytes = base64.b64decode(result["isolated_audio_base64"])
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(isolated_audio_bytes)
        return tmp.name
    except Exception:
        # Best-effort UI fallback. `except Exception` (instead of a bare
        # `except:`) lets KeyboardInterrupt/SystemExit propagate.
        return None
def demo_extract_embedding(audio_file):
    """Extract Embedding - extract voice fingerprint.

    Args:
        audio_file: Path of the uploaded audio file; a falsy value yields
            the empty-state card.

    Returns:
        HTML string: the embedding summary card, or a red error card when
        the backend reports an error or the result cannot be rendered.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not audio_file:
        return create_embedding_empty()

    audio_b64 = file_to_base64(audio_file)
    result_json = extract_embedding(audio_b64)
    try:
        result = json.loads(result_json)
        if "error" in result:
            # Error text comes from the backend / exceptions and may contain
            # markup characters, so escape it before embedding it in HTML.
            error_text = escape(str(result.get("error", "Unknown error")))
            return f'''<div style="color: #ef4444; padding: 20px; background: #fee; border-radius: 12px; border: 1px solid #fca5a5;">
<strong>Error in result:</strong><br>{error_text}
</div>'''
        return create_embedding_visualization(result)
    except Exception as e:
        problem = escape(str(e))
        raw_excerpt = escape(result_json[:500])
        return f'''<div style="color: #ef4444; padding: 20px; background: #fee; border-radius: 12px; border: 1px solid #fca5a5;">
<strong>Parsing Error:</strong> {problem}<br><br>
<strong>Raw Result (first 500 chars):</strong><br>
<code style="display: block; padding: 10px; background: white; border-radius: 4px; overflow-x: auto; font-size: 12px;">{raw_excerpt}</code>
</div>'''
def demo_match_voice(audio1, audio2):
    """Compare Voices - compare two voice similarities.

    Args:
        audio1: Path of the first audio file.
        audio2: Path of the second audio file.

    Returns:
        HTML string with the similarity donut, or the empty-state card when
        an input is missing, the backend reports an error, or the result
        cannot be rendered.
    """
    if not audio1 or not audio2:
        return create_compare_empty()

    audio1_b64 = file_to_base64(audio1)
    audio2_b64 = file_to_base64(audio2)
    result_json = match_voice(audio1_b64, audio2_b64)
    try:
        result = json.loads(result_json)
        if "error" in result:
            return create_compare_empty()
        return create_compare_visualization(result)
    except Exception:
        # Best-effort UI fallback. `except Exception` (instead of a bare
        # `except:`) lets KeyboardInterrupt/SystemExit propagate.
        return create_compare_empty()
def demo_voice_similarity(user_audio, ref_audio):
    """Voice Similarity - comprehensive 5-metric analysis.

    Calls grade_voice with an empty reference text and the "meme" category.

    Args:
        user_audio: Path of the user's audio file.
        ref_audio: Path of the reference audio file.

    Returns:
        HTML string with the similarity visualization, or the empty-state
        card when an input is missing, the backend reports an error, or the
        result cannot be rendered.
    """
    if not user_audio or not ref_audio:
        return create_similarity_empty()

    user_b64 = file_to_base64(user_audio)
    ref_b64 = file_to_base64(ref_audio)
    result_json = grade_voice(user_b64, ref_b64, "", "meme")
    try:
        result = json.loads(result_json)
        if "error" in result:
            return create_similarity_empty()
        return create_similarity_visualization(result)
    except Exception:
        # Best-effort UI fallback. `except Exception` (instead of a bare
        # `except:`) lets KeyboardInterrupt/SystemExit propagate.
        return create_similarity_empty()
# ============================================================================
# Visualization Functions
# ============================================================================
def create_acoustic_empty():
    """Return the placeholder card shown before any acoustic analysis exists.

    The card is static HTML: a dimmed waveform/magnifier icon above a short
    upload hint.
    """
    card_style = """
background: rgba(10, 10, 26, 0.4);
border: 1px solid rgba(124, 58, 237, 0.2);
border-radius: 16px;
padding: 30px 20px;
text-align: center;
height: 100%;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
"""
    icon_svg = """<svg width="48" height="48" viewBox="0 0 24 24" fill="none" style="margin: 0 auto; display: block;">
<path d="M22 10C22 10 20 4 17 4C14 4 12 16 9 16C6 16 4 10 2 10" stroke="#7c3aed" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<g transform="translate(13, 11)">
<circle cx="5" cy="5" r="4" stroke="#7c3aed" stroke-width="1.5"/>
<path d="M8 8L11 11" stroke="#7c3aed" stroke-width="1.5" stroke-linecap="round"/>
</g>
</svg>"""
    hint = "Upload audio to analyze acoustic features"
    return f"""
<div style="{card_style}">
<div style="margin-bottom: 12px; opacity: 0.5;">
{icon_svg}
</div>
<div style="color: #a5b4fc; font-size: 12px; line-height: 1.5;">
{hint}
</div>
</div>
"""
def create_acoustic_visualization(result):
    """Render the acoustic analysis result as a five-axis SVG radar chart.

    Args:
        result: Mapping with the sections "pitch", "energy", "rhythm" and
            "spectral" (each read for its "score") and a "tempo" value in
            BPM. Missing, None, non-dict or non-numeric entries are treated
            as 0 instead of raising.

    Returns:
        HTML string: a card containing a 300x300 SVG with five concentric
        background pentagons, the axis lines, the data polygon with its
        vertex markers, and one label (name + integer score) per axis.

    Scores are clamped to 0-100 so the data polygon can never leave the
    chart. Tempo is mapped linearly from the 60-180 BPM range onto 0-100.
    """
    def to_number(raw):
        # Tolerate None / strings / NaN coming back from the backend.
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return 0.0
        return value if math.isfinite(value) else 0.0

    def section_score(name):
        section = result.get(name)
        raw = section.get("score", 0) if isinstance(section, dict) else 0
        return min(100.0, max(0.0, to_number(raw)))

    # Use pre-calculated scores from Modal backend (expected to be 0-100)
    pitch_norm = section_score("pitch")
    energy_norm = section_score("energy")
    rhythm_norm = section_score("rhythm")
    spectral_norm = section_score("spectral")

    # Tempo: normalize BPM to 0-100 (60-180 BPM range)
    tempo_bpm = to_number(result.get("tempo", 0))
    tempo_norm = min(100, max(0, (tempo_bpm - 60) / 120 * 100)) if tempo_bpm > 0 else 0

    # Radar chart geometry
    center_x, center_y = 150, 150
    radius = 110

    # Axes are 72 degrees apart, rotated by -90 degrees so the first one
    # points straight up (SVG's y axis grows downwards).
    metrics = [
        ("Pitch", pitch_norm, -90),       # top
        ("Energy", energy_norm, -18),     # top-right
        ("Rhythm", rhythm_norm, 54),      # bottom-right
        ("Tempo", tempo_norm, 126),       # bottom-left
        ("Spectral", spectral_norm, 198)  # top-left
    ]

    def polar(angle_deg, distance):
        """Formatted (x, y) of a point at *distance* from the center."""
        angle_rad = math.radians(angle_deg)
        x = center_x + distance * math.cos(angle_rad)
        y = center_y + distance * math.sin(angle_rad)
        return f"{x:.2f}", f"{y:.2f}"

    # Data polygon vertices and their circular markers
    data_xy = [polar(angle, (value / 100) * radius) for _, value, angle in metrics]
    data_points = " ".join(f"{x},{y}" for x, y in data_xy)
    circles = "".join(
        f'<circle cx="{x}" cy="{y}" r="4" fill="#a855f7"/>' for x, y in data_xy
    )

    # Background concentric pentagons (20, 40, 60, 80, 100)
    pentagons = []
    for scale in [0.2, 0.4, 0.6, 0.8, 1.0]:
        ring = " ".join(
            ",".join(polar(angle, radius * scale)) for _, _, angle in metrics
        )
        pentagons.append(
            f'<polygon points="{ring}" fill="none" stroke="rgba(124, 58, 237, 0.15)" stroke-width="1"/>'
        )
    background_pentagons = "".join(pentagons)

    # Axis lines from center to vertices
    axes = []
    for _, _, angle in metrics:
        x, y = polar(angle, radius)
        axes.append(
            f'<line x1="{center_x}" y1="{center_y}" x2="{x}" y2="{y}" stroke="rgba(124, 58, 237, 0.3)" stroke-width="1"/>'
        )
    axis_lines = "".join(axes)

    # Labels sit 25px outside the outer pentagon
    label_parts = []
    for label, value, angle in metrics:
        x, y = polar(angle, radius + 25)
        label_parts.append(f'''<text x="{x}" y="{y}" text-anchor="middle" dominant-baseline="middle" fill="#a5b4fc" font-size="11" font-weight="600">
{label}
<tspan x="{x}" dy="12" fill="#a855f7" font-size="13" font-weight="700">{int(value)}</tspan>
</text>''')
    labels = "".join(label_parts)

    return f"""
<div style="
background: rgba(10, 10, 26, 0.6);
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 16px;
padding: 20px;
display: flex;
align-items: center;
justify-content: center;
">
<svg width="300" height="300" viewBox="0 0 300 300">
<!-- Background pentagons -->
{background_pentagons}
<!-- Axis lines -->
{axis_lines}
<!-- Data polygon -->
<polygon points="{data_points}"
fill="rgba(124, 58, 237, 0.3)"
stroke="#a855f7"
stroke-width="2"/>
<!-- Data points -->
{circles}
<!-- Labels -->
{labels}
</svg>
</div>
"""
def create_mimicry_empty():
    """Return the placeholder card for the voice mimicry panel.

    The card is static HTML: a dimmed gradient microphone icon above a short
    upload hint.
    """
    card_style = """
background: rgba(10, 10, 26, 0.4);
border: 1px solid rgba(124, 58, 237, 0.2);
border-radius: 16px;
padding: 30px 20px;
text-align: center;
height: 100%;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
"""
    icon_svg = """<svg width="48" height="48" viewBox="0 0 24 24" fill="none" style="margin: 0 auto; display: block;">
<defs>
<linearGradient id="micGradEmpty" x1="0%" y1="0%" x2="100%" y2="100%">
<stop offset="0%" style="stop-color:#8b5cf6"/>
<stop offset="100%" style="stop-color:#6366f1"/>
</linearGradient>
</defs>
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3z" fill="url(#micGradEmpty)"/>
<path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z" fill="url(#micGradEmpty)"/>
</svg>"""
    hint = "Upload reference and your voice to see similarity scores"
    return f"""
<div style="{card_style}">
<div style="margin-bottom: 12px; opacity: 0.5;">
{icon_svg}
</div>
<div style="color: #a5b4fc; font-size: 12px; line-height: 1.5;">
{hint}
</div>
</div>
"""
def create_mimicry_visualization(result):
    """Render mimicry scores as a card with five horizontal progress bars.

    Args:
        result: Mapping read for the keys "pronunciation", "transcript",
            "pitch", "rhythm" and "energy"; a missing key counts as 0.

    Returns:
        HTML string: a header strip followed by one bar per score. Each
        value is used both as the bar width (in percent) and as the number
        printed next to it.
    """
    # (bar caption, result key). The "Tone" bar is fed from the
    # "transcript" key.
    bar_specs = (
        ("Pronunciation", "pronunciation"),
        ("Tone", "transcript"),
        ("Pitch", "pitch"),
        ("Rhythm", "rhythm"),
        ("Energy", "energy"),
    )

    def render_bar(caption, score):
        return f"""
<div style="display: flex; align-items: center; gap: 12px; margin-bottom: 10px;">
<div style="flex: 1;">
<div style="font-size: 11px; color: #cbd5e1; margin-bottom: 4px;">{caption}</div>
<div style="
height: 6px;
background: rgba(124, 58, 237, 0.2);
border-radius: 3px;
overflow: hidden;
">
<div style="
height: 100%;
width: {score}%;
background: linear-gradient(90deg, #6366f1, #22d3ee);
border-radius: 3px;
"></div>
</div>
</div>
<div style="
font-size: 14px;
font-weight: 700;
color: #22d3ee;
min-width: 32px;
text-align: right;
">{score}</div>
</div>
"""

    bars_html = "\n".join(
        render_bar(caption, result.get(key, 0)) for caption, key in bar_specs
    )

    return f"""
<div style="
background: rgba(10, 10, 26, 0.6);
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 16px;
padding: 20px;
height: 100%;
display: flex;
flex-direction: column;
">
<div style="
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 16px;
padding-bottom: 14px;
border-bottom: 1px solid rgba(124, 58, 237, 0.2);
">
<div style="
width: 40px;
height: 40px;
border-radius: 10px;
background: linear-gradient(135deg, #7c3aed, #6366f1);
display: flex;
align-items: center;
justify-content: center;
flex-shrink: 0;
">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none">
<circle cx="12" cy="12" r="10" fill="rgba(255, 255, 255, 0.2)" stroke="white" stroke-width="1.5"/>
<text x="12" y="16" text-anchor="middle" font-size="10" fill="white" font-weight="bold">AI</text>
</svg>
</div>
<div style="flex: 1; min-width: 0;">
<div style="font-size: 10px; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px;">CLAUDE</div>
<div style="font-size: 11px; color: #cbd5e1; line-height: 1.4;">
Wow, that voice input, takes analytical skills of course but I'll handle it
</div>
</div>
</div>
<div style="flex: 1;">
{bars_html}
</div>
</div>
"""
def create_transcription_empty():
    """Return the placeholder card shown before any transcription exists."""
    card_style = """
background: rgba(10, 10, 26, 0.4);
border: 1px solid rgba(124, 58, 237, 0.2);
border-radius: 12px;
padding: 20px;
text-align: center;
color: #a5b4fc;
font-size: 13px;
"""
    hint = "Upload audio to transcribe"
    return f"""
<div style="{card_style}">
{hint}
</div>
"""
def create_transcription_visualization(text):
    """Render a transcription result as a simple text card.

    Args:
        text: The transcribed text. A falsy value (empty string, None) is
            replaced by the message "Transcription completed".

    Returns:
        HTML string containing the text. The text is HTML-escaped because it
        originates from arbitrary audio and must be shown literally rather
        than interpreted as markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    shown = escape(str(text), quote=False) if text else "Transcription completed"
    return f"""
<div style="
background: rgba(10, 10, 26, 0.6);
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 12px;
padding: 20px;
color: #e2e8f0;
font-size: 20px;
line-height: 1.6;
white-space: pre-wrap;
word-wrap: break-word;
">
{shown}
</div>
"""
def create_embedding_empty():
    """Return the placeholder card shown before any embedding was extracted.

    The card is static HTML: a dimmed cube icon above a short upload hint.
    """
    card_style = """
background: rgba(10, 10, 26, 0.4);
border: 1px solid rgba(124, 58, 237, 0.2);
border-radius: 16px;
padding: 30px 20px;
text-align: center;
height: 100%;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
"""
    icon_svg = """<svg width="48" height="48" viewBox="0 0 24 24" fill="none" style="margin: 0 auto; display: block;">
<path d="M21 16V8L12 4L3 8V16L12 20L21 16Z" stroke="#A855F7" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M12 4V12M12 12V20M12 12L21 8M12 12L3 8" stroke="#A855F7" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<circle cx="12" cy="12" r="2" fill="#A855F7"/>
</svg>"""
    hint = "Upload audio to extract voice embedding"
    return f"""
<div style="{card_style}">
<div style="margin-bottom: 12px; opacity: 0.5;">
{icon_svg}
</div>
<div style="color: #a5b4fc; font-size: 12px; line-height: 1.5;">
{hint}
</div>
</div>
"""
def create_embedding_visualization(result):
    """Render the embedding extraction result as a three-row summary card.

    Args:
        result: Mapping read for "model" (default "Wav2Vec2"),
            "embedding_length" (falling back to "dim", then 768) and
            "embedding_preview" (a list of leading vector values).

    Returns:
        HTML string with the model name, the dimension count and a preview
        line. Preview entries that are not int/float (such as the "..."
        marker) are skipped; the rest are printed with four decimals. With
        nothing numeric to show, the preview is just "...".
    """
    model = result.get("model", "Wav2Vec2")
    dim = result.get("embedding_length", result.get("dim", 768))

    # Filter only numeric values to avoid format errors with strings like "..."
    numeric_preview = [
        v for v in (result.get("embedding_preview", []) or [])
        if isinstance(v, (int, float))
    ]
    if numeric_preview:
        preview_str = ", ".join(f"{v:.4f}" for v in numeric_preview)
    else:
        preview_str = "..."

    def stat_row(caption, value):
        """One label/value line of the card."""
        return f"""<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; padding: 10px; background: rgba(124, 58, 237, 0.1); border-radius: 8px;">
<div style="font-size: 16px; color: #cbd5e1; font-weight: 600;">{caption}</div>
<div style="font-size: 18px; font-weight: 700; color: #22d3ee;">{value}</div>
</div>"""

    model_row = stat_row("Model", model)
    dim_row = stat_row("Dimensions", dim)

    return f"""
<div style="
background: rgba(10, 10, 26, 0.6);
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 16px;
padding: 20px;
height: 100%;
display: flex;
flex-direction: column;
">
{model_row}
{dim_row}
<div style="padding: 10px; background: rgba(124, 58, 237, 0.1); border-radius: 8px;">
<div style="font-size: 16px; color: #cbd5e1; font-weight: 600; margin-bottom: 8px;">Preview</div>
<div style="font-size: 14px; font-family: monospace; color: #22d3ee; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">
[{preview_str}]
</div>
</div>
</div>
"""
def create_compare_empty():
    """Return the placeholder card shown before two voices were compared.

    The card is static HTML: a dimmed icon of two facing waveforms above a
    short upload hint.
    """
    card_style = """
background: rgba(10, 10, 26, 0.4);
border: 1px solid rgba(124, 58, 237, 0.2);
border-radius: 16px;
padding: 30px 20px;
text-align: center;
height: 100%;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
"""
    icon_svg = """<svg width="48" height="48" viewBox="0 0 24 24" fill="none" style="margin: 0 auto; display: block;">
<path d="M2 10V14" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M5 8V16" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M8 11V13" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M22 10V14" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M19 7V17" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M16 11V13" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M10 12H14" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
</svg>"""
    hint = "Upload two audio files to compare voices"
    return f"""
<div style="{card_style}">
<div style="margin-bottom: 12px; opacity: 0.5;">
{icon_svg}
</div>
<div style="color: #a5b4fc; font-size: 12px; line-height: 1.5;">
{hint}
</div>
</div>
"""
def create_compare_visualization(result):
    """Render a voice comparison result as a donut with the similarity score.

    Args:
        result: Mapping read for "similarity", expected as a 0-1 fraction.
            Missing, None, non-numeric or non-finite values count as 0.

    Returns:
        HTML string: a conic-gradient ring filled proportionally to the
        similarity, with the percentage printed in the middle.

    The fraction is converted with round() rather than int(): truncation
    turns float noise such as 0.29 * 100 == 28.999999999999996 into 28.
    The percentage is clamped to 0-100 so that out-of-range input cannot
    produce an invalid gradient.
    """
    try:
        similarity = float(result.get("similarity", 0) or 0)
    except (TypeError, ValueError):
        similarity = 0.0
    if not math.isfinite(similarity):
        similarity = 0.0

    # Convert similarity to a whole percentage within 0-100
    similarity_pct = min(100, max(0, round(similarity * 100)))
    # 1% of the ring corresponds to 3.6 degrees
    sweep_deg = f"{similarity_pct * 3.6:.1f}"

    # Color based on similarity - Purple theme matching VOICE SIMILARITY
    if similarity_pct >= 80:
        color = "#c084fc"  # Light purple (high score)
    elif similarity_pct >= 60:
        color = "#a855f7"  # Medium purple (medium score)
    else:
        color = "#7c3aed"  # Dark purple (low score)

    return f"""
<div style="
background: rgba(10, 10, 26, 0.6);
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 16px;
padding: 20px;
height: 100%;
display: flex;
align-items: flex-end;
justify-content: center;
padding-bottom: 40px;
">
<div style="
width: 160px;
height: 160px;
border-radius: 50%;
background: conic-gradient({color} 0deg {sweep_deg}deg, rgba(124, 58, 237, 0.2) {sweep_deg}deg 360deg);
display: flex;
align-items: center;
justify-content: center;
">
<div style="
width: 130px;
height: 130px;
border-radius: 50%;
background: rgba(10, 10, 26, 0.9);
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
">
<span style="font-size: 40px; font-weight: 700; color: {color};">{similarity_pct}</span>
<span style="font-size: 11px; color: #a5b4fc; letter-spacing: 0.5px;">SIMILARITY</span>
</div>
</div>
</div>
"""
def create_similarity_empty():
    """Return the placeholder card shown before a similarity analysis exists.

    The card is static HTML: a dimmed radar-style icon above a short upload
    hint.
    """
    card_style = """
background: rgba(10, 10, 26, 0.4);
border: 1px solid rgba(124, 58, 237, 0.2);
border-radius: 16px;
padding: 30px 20px;
text-align: center;
height: 100%;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
"""
    icon_svg = """<svg width="48" height="48" viewBox="0 0 24 24" fill="none" style="margin: 0 auto; display: block;">
<circle cx="12" cy="12" r="9" stroke="#A855F7" stroke-width="1" opacity="0.3"/>
<path d="M12 5L18 9L16.5 18H7.5L6 9L12 5Z" fill="#A855F7" fill-opacity="0.4" stroke="#A855F7" stroke-width="2" stroke-linejoin="round"/>
<circle cx="12" cy="5" r="1.5" fill="#A855F7"/>
</svg>"""
    hint = "Upload audio files for comprehensive similarity analysis"
    return f"""
<div style="{card_style}">
<div style="margin-bottom: 12px; opacity: 0.5;">
{icon_svg}
</div>
<div style="color: #a5b4fc; font-size: 12px; line-height: 1.5;">
{hint}
</div>
</div>
"""
def create_similarity_visualization(result):
    """Render a voice-similarity result as an HTML card.

    The card shows a donut with the overall score on the left and an SVG
    radar chart with five axes (pronunciation, transcript, pitch, energy,
    rhythm) on the right.

    Args:
        result: Mapping with optional keys ``overall``, ``pronunciation``,
            ``transcript``, ``pitch``, ``rhythm`` and ``energy``. Scores are
            treated as percentages. Missing, ``None``, non-numeric or
            non-finite values are rendered as 0, and values outside 0-100
            are clamped so the data polygon stays inside the outer pentagon
            and the donut sweep stays within 0-360 degrees.

    Returns:
        str: HTML fragment with inline styles and an inline ``<svg>``.
    """

    def score(key):
        # Sanitise one score: anything that is not a finite number becomes 0,
        # everything else is clamped to the 0-100 range the chart is drawn for.
        raw = result.get(key, 0)
        if not isinstance(raw, (int, float)):
            return 0
        if isinstance(raw, float) and not math.isfinite(raw):
            return 0
        return max(0, min(100, raw))

    overall = score("overall")
    pronunciation = score("pronunciation")
    transcript = score("transcript")
    pitch = score("pitch")
    rhythm = score("rhythm")
    energy = score("energy")

    # Color based on overall score - Purple theme
    if overall >= 80:
        color = "#c084fc"  # Light purple (high score)
    elif overall >= 60:
        color = "#a855f7"  # Medium purple (medium score)
    else:
        color = "#7c3aed"  # Dark purple (low score)

    # Donut sweep angle: 100 points map to the full 360 degrees.
    sweep_deg = overall * 3.6

    # Radar chart geometry (SVG viewBox is 300x300).
    center_x, center_y = 150, 150
    radius = 110

    # (label, value, angle in degrees). Angles are shifted by -90 degrees so
    # the first axis points straight up; the axes are 72 degrees apart.
    metrics = [
        ("Pronunciation", pronunciation, -90),  # top
        ("Transcript", transcript, -18),        # top-right
        ("Pitch", pitch, 54),                   # bottom-right
        ("Energy", energy, 126),                # bottom-left
        ("Rhythm", rhythm, 198),                # top-left
    ]

    def vertex(angle_deg, distance):
        # Polar -> cartesian around the chart centre, formatted for SVG.
        angle_rad = math.radians(angle_deg)
        x = center_x + distance * math.cos(angle_rad)
        y = center_y + distance * math.sin(angle_rad)
        return f"{x:.2f}", f"{y:.2f}"

    # Data polygon vertices, one per metric, scaled by the score.
    data_vertices = [
        vertex(angle_deg, (value / 100) * radius)
        for _, value, angle_deg in metrics
    ]
    data_polygon_points = " ".join(f"{x},{y}" for x, y in data_vertices)
    data_circles = "".join(
        f'<circle cx="{x}" cy="{y}" r="4" fill="#a855f7"/>'
        for x, y in data_vertices
    )

    # Background concentric pentagons (20, 40, 60, 80, 100)
    def create_pentagon_points(scale):
        return " ".join(
            ",".join(vertex(angle_deg, radius * scale))
            for _, _, angle_deg in metrics
        )

    background_pentagons = "".join(
        f'<polygon points="{create_pentagon_points(scale)}" fill="none" stroke="rgba(124, 58, 237, 0.15)" stroke-width="1"/>'
        for scale in (0.2, 0.4, 0.6, 0.8, 1.0)
    )

    # Axis lines from center to the outer vertices
    axis_parts = []
    for _, _, angle_deg in metrics:
        x, y = vertex(angle_deg, radius)
        axis_parts.append(
            f'<line x1="{center_x}" y1="{center_y}" x2="{x}" y2="{y}" stroke="rgba(124, 58, 237, 0.3)" stroke-width="1"/>'
        )
    axis_lines = "".join(axis_parts)

    # Labels sit slightly outside the outer pentagon; the score is shown on a
    # second line below the metric name.
    label_radius = radius + 25
    label_parts = []
    for label, value, angle_deg in metrics:
        x, y = vertex(angle_deg, label_radius)
        label_parts.append(
            f'<text x="{x}" y="{y}" text-anchor="middle" dominant-baseline="middle" fill="#a5b4fc" font-size="11" font-weight="600">\n'
            f'{label}\n'
            f'<tspan x="{x}" dy="12" fill="#a855f7" font-size="13" font-weight="700">{value}</tspan>\n'
            '</text>'
        )
    labels = "".join(label_parts)

    return f"""
<div style="
background: rgba(10, 10, 26, 0.6);
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 16px;
padding: 20px;
display: flex;
align-items: center;
gap: 30px;
">
<!-- Left: Overall Score Donut -->
<div style="flex: 1; display: flex; align-items: center; justify-content: center;">
<div style="
width: 160px;
height: 160px;
border-radius: 50%;
background: conic-gradient({color} 0deg {sweep_deg}deg, rgba(124, 58, 237, 0.2) {sweep_deg}deg 360deg);
display: flex;
align-items: center;
justify-content: center;
">
<div style="
width: 130px;
height: 130px;
border-radius: 50%;
background: rgba(10, 10, 26, 0.9);
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
">
<span style="font-size: 40px; font-weight: 700; color: {color};">{overall}</span>
<span style="font-size: 11px; color: #a5b4fc; letter-spacing: 0.5px;">OVERALL</span>
</div>
</div>
</div>
<!-- Right: Radar Chart -->
<div style="flex: 1; display: flex; align-items: center; justify-content: center;">
<svg width="300" height="300" viewBox="0 0 300 300">
<!-- Background pentagons -->
{background_pentagons}
<!-- Axis lines -->
{axis_lines}
<!-- Data polygon -->
<polygon points="{data_polygon_points}"
fill="rgba(124, 58, 237, 0.3)"
stroke="#a855f7"
stroke-width="2"/>
<!-- Data points -->
{data_circles}
<!-- Labels -->
{labels}
</svg>
</div>
</div>
"""
# Clean audio functions removed - using gr.Audio component directly
# ============================================================================
# Gradio Interface with Dark Theme
# ============================================================================
# Page-wide stylesheet for the Gradio UI, kept as one plain (non-f) string.
# It is interpolated by value into a <style> tag in the first gr.HTML of the
# Blocks layout below, so the CSS braces here must stay single (not doubled).
# Sections, in order: dark-theme variable overrides, container/layout widths,
# header, docs button and modal, cards, terminal-style code block, tools
# table, composite section, generic form controls, demo-card layout, the
# audio-widget text-hiding rules, custom action buttons, and decorations.
# NOTE(review): the in-string remark that all colors are solid does not hold
# for the whole sheet -- many later rules use rgba() values.
# NOTE(review): `.gradio-row` is styled in several separate rules (full-width
# overrides, layout reset, row spacing); keep them in sync when editing.
custom_css = """
/* ===== DARK THEME STYLING (CSS-ONLY) ===== */
/* This CSS forces dark mode appearance regardless of system/Gradio theme */
/* All colors are SOLID (not rgba/transparent) to ensure consistent appearance */
:root {
color-scheme: dark !important;
--body-background-fill: #0a0a1a !important;
--background-fill-primary: #0d0d1a !important;
--background-fill-secondary: #12122a !important;
--block-background-fill: #0d0d1a !important;
--input-background-fill: #1a1a35 !important;
--body-text-color: #e0e7ff !important;
--block-title-text-color: #a5b4fc !important;
--block-label-text-color: #a5b4fc !important;
--input-text-color: #e0e7ff !important;
--neutral-50: #0a0a1a !important;
--neutral-100: #0d0d1a !important;
--neutral-200: #12122a !important;
--neutral-300: #1a1a35 !important;
--neutral-400: #2d2d4a !important;
--neutral-500: #4a4a6a !important;
--neutral-600: #7c7c9a !important;
--neutral-700: #a5b4fc !important;
--neutral-800: #c7d2fe !important;
--neutral-900: #e0e7ff !important;
--neutral-950: #ffffff !important;
}
/* Force dark mode on html and body */
html, body {
background: #0a0a1a !important;
background-color: #0a0a1a !important;
color: #e0e7ff !important;
}
/* ===== GLOBAL STYLES ===== */
body {
background: linear-gradient(180deg, #0a0a1a 0%, #0f0f23 100%) !important;
background-color: #0a0a1a !important;
color: #ffffff !important;
font-family: system-ui, -apple-system, sans-serif;
}
/* Override Gradio's light mode backgrounds AND text colors */
.dark, .light, [data-theme="light"], [data-theme="dark"],
html[data-theme="light"], html[data-theme="dark"],
body.light, body.dark {
--body-background-fill: #0a0a1a !important;
--background-fill-primary: #0d0d1a !important;
--background-fill-secondary: #12122a !important;
--block-background-fill: #0d0d1a !important;
--input-background-fill: #1a1a35 !important;
--body-text-color: #e0e7ff !important;
--block-title-text-color: #a5b4fc !important;
--block-label-text-color: #a5b4fc !important;
--input-text-color: #e0e7ff !important;
--neutral-50: #0a0a1a !important;
--neutral-100: #0d0d1a !important;
--neutral-200: #12122a !important;
--neutral-300: #1a1a35 !important;
--neutral-400: #2d2d4a !important;
--neutral-500: #4a4a6a !important;
--neutral-600: #7c7c9a !important;
--neutral-700: #a5b4fc !important;
--neutral-800: #c7d2fe !important;
--neutral-900: #e0e7ff !important;
--neutral-950: #ffffff !important;
color: #e0e7ff !important;
background: #0a0a1a !important;
background-color: #0a0a1a !important;
}
.gradio-container {
max-width: 100% !important;
width: 100% !important;
padding: 0px 16px 20px 16px !important;
background: #0a0a1a !important;
background-color: #0a0a1a !important;
margin: 0 !important;
}
.gradio-container > .main,
.gradio-container .main,
.main {
max-width: 100% !important;
width: 100% !important;
padding-left: 0 !important;
padding-right: 0 !important;
margin: 0 auto !important;
}
.contain {
max-width: 100% !important;
padding: 0 !important;
}
/* Force full width on all Gradio internal containers */
.gradio-container > div,
.gradio-container > div > div,
#component-0,
.wrap,
.app,
.contain,
footer,
.gradio-row,
.gradio-column,
.svelte-1gfkn6j,
[class*="svelte-"] {
max-width: 100% !important;
}
.gradio-row {
max-width: 100% !important;
width: 100% !important;
margin: 0 !important;
padding: 0 !important;
}
/* ===== HEADER (FLOATING, NO CARD) ===== */
.header-main {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 0;
padding: 0;
}
.header-left {
display: flex;
align-items: center;
gap: 16px;
}
.header-icon {
font-size: 48px;
filter: drop-shadow(0 4px 12px rgba(99, 102, 241, 0.6));
}
.header-title {
font-size: 42px;
font-weight: 900;
color: #e0e7ff;
margin: 0;
letter-spacing: -0.5px;
}
.header-subtitle {
color: #c7d2fe;
font-size: 20px;
font-weight: 700;
margin-left: 6px;
}
/* ===== DOCS BUTTON ===== */
.docs-button {
display: flex;
align-items: center;
gap: 8px;
padding: 10px 20px;
background: linear-gradient(135deg, rgba(124, 58, 237, 0.3), rgba(99, 102, 241, 0.3));
border: 1px solid rgba(124, 58, 237, 0.5);
border-radius: 12px;
color: #e0e7ff;
font-size: 14px;
font-weight: 600;
cursor: pointer;
transition: all 0.3s ease;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.docs-button:hover {
background: linear-gradient(135deg, rgba(124, 58, 237, 0.5), rgba(99, 102, 241, 0.5));
border-color: rgba(124, 58, 237, 0.8);
transform: translateY(-2px);
box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4);
}
.docs-button svg {
width: 18px;
height: 18px;
}
/* ===== DOCS MODAL ===== */
.docs-modal-overlay {
display: none;
position: fixed !important;
inset: 0 !important;
width: 100vw !important;
height: 100vh !important;
background: rgba(0, 0, 0, 0.85) !important;
backdrop-filter: blur(10px) !important;
z-index: 99999 !important;
justify-content: center !important;
align-items: flex-start !important;
padding-top: 60px !important;
box-sizing: border-box !important;
/* Modal positioned near top of viewport */
overflow: hidden !important;
}
.docs-modal-overlay.active {
display: flex !important;
}
.docs-modal {
position: relative !important;
background: #0d0d1a !important;
border: 2px solid #7c3aed !important;
border-radius: 20px !important;
width: calc(100vw - 80px) !important;
max-width: 1200px !important;
max-height: 55vh !important;
overflow: hidden !important;
box-shadow: 0 25px 80px rgba(0, 0, 0, 0.9) !important;
/* Remove margin that could affect centering */
margin: 0 !important;
/* Prevent any transform inheritance issues */
transform: none !important;
}
.docs-modal-header {
display: flex !important;
justify-content: space-between !important;
align-items: center !important;
padding: 20px 24px !important;
border-bottom: 2px solid #7c3aed !important;
background: #1a1a2e !important;
}
.docs-modal-title {
font-size: 20px;
font-weight: 700;
color: #e0e7ff;
display: flex;
align-items: center;
gap: 10px;
}
.docs-modal-close {
background: rgba(124, 58, 237, 0.3);
border: 2px solid rgba(124, 58, 237, 0.5);
border-radius: 12px;
color: #e0e7ff;
font-size: 28px;
font-weight: 300;
cursor: pointer;
padding: 4px 14px;
line-height: 1;
transition: all 0.2s;
}
.docs-modal-close:hover {
background: rgba(124, 58, 237, 0.4);
border-color: rgba(124, 58, 237, 0.6);
}
.docs-modal-content {
padding: 24px !important;
overflow-y: auto !important;
max-height: calc(55vh - 80px) !important;
color: #c7d2fe !important;
font-size: 15px !important;
line-height: 1.7 !important;
background: #0d0d1a !important;
}
.docs-modal-content h1 { font-size: 28px; color: #e0e7ff; margin: 0 0 16px 0; padding-bottom: 12px; border-bottom: 2px solid rgba(124, 58, 237, 0.3); }
.docs-modal-content h2 { font-size: 22px; color: #e0e7ff; margin: 24px 0 12px 0; }
.docs-modal-content h3 { font-size: 18px; color: #a5b4fc; margin: 20px 0 10px 0; }
.docs-modal-content p { margin: 12px 0; }
.docs-modal-content ul, .docs-modal-content ol { margin: 12px 0; padding-left: 24px; }
.docs-modal-content li { margin: 6px 0; }
.docs-modal-content code { background: rgba(124, 58, 237, 0.2); padding: 2px 6px; border-radius: 4px; font-family: 'SF Mono', 'Monaco', 'Consolas', monospace; font-size: 13px; color: #c4b5fd; }
.docs-modal-content pre { background: rgba(0, 0, 0, 0.4); border: 1px solid rgba(124, 58, 237, 0.2); border-radius: 12px; padding: 16px; overflow-x: auto; margin: 16px 0; white-space: pre; }
.docs-modal-content pre code { background: transparent; padding: 0; color: #a5b4fc; white-space: pre; display: block; }
.docs-modal-content table { width: 100%; border-collapse: collapse; margin: 16px 0; }
.docs-modal-content th, .docs-modal-content td { padding: 10px 12px; text-align: left; border: 1px solid rgba(124, 58, 237, 0.2); }
.docs-modal-content th { background: rgba(124, 58, 237, 0.15); color: #e0e7ff; font-weight: 600; }
.docs-modal-content td { color: #c7d2fe; }
.docs-modal-content a { color: #a78bfa; text-decoration: none; }
.docs-modal-content a:hover { text-decoration: underline; }
.docs-modal-content strong { color: #e0e7ff; }
.docs-modal-content img { max-width: 100%; max-height: 400px; height: auto; border-radius: 8px; margin: 12px 0; object-fit: contain; }
/* ===== CARD STYLES ===== */
.card {
background: #0f0f23 !important;
background-color: #0f0f23 !important;
border: 1px solid #3d2a6b !important;
border-radius: 20px;
padding: 30px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4);
transition: all 0.3s ease;
height: 100%;
display: flex;
flex-direction: column;
}
.card:hover {
border-color: #5b3d99 !important;
box-shadow: 0 12px 48px rgba(124, 58, 237, 0.3);
}
/* Ensure columns in top row have equal height */
.gradio-row:first-of-type .gradio-column {
display: flex !important;
flex-direction: column !important;
}
.gradio-row:first-of-type .gradio-column > div {
flex: 1 !important;
display: flex !important;
flex-direction: column !important;
}
/* Set minimum height for top row cards */
.gradio-row:first-of-type .card {
min-height: 550px;
}
.card-title {
font-size: 16px;
font-weight: 700;
color: #a5b4fc;
text-transform: uppercase;
letter-spacing: 1px;
margin-bottom: 20px;
display: flex;
align-items: center;
}
/* ===== ROW SPACING ===== */
.gradio-row {
gap: 24px !important;
}
/* ===== QUICK START - CODE BLOCK (TERMINAL/IDE STYLE) ===== */
.terminal-window {
background: #1a1b26;
border: 1px solid rgba(124, 58, 237, 0.3);
border-radius: 12px;
overflow: hidden;
margin-bottom: 16px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.6);
}
.terminal-header {
background: #16161e;
padding: 12px 16px;
display: flex;
align-items: center;
justify-content: space-between;
border-bottom: 1px solid rgba(124, 58, 237, 0.2);
}
.terminal-dots {
display: flex;
gap: 8px;
}
.terminal-dot {
width: 12px;
height: 12px;
border-radius: 50%;
}
.terminal-dot.red {
background: #ff5f56 !important;
box-shadow: 0 0 8px rgba(255, 95, 86, 0.8) !important;
}
.terminal-dot.yellow {
background: #ffbd2e !important;
box-shadow: 0 0 8px rgba(255, 189, 46, 0.8) !important;
}
.terminal-dot.green {
background: #27c93f !important;
box-shadow: 0 0 8px rgba(39, 201, 63, 0.8) !important;
}
.terminal-title {
font-size: 12px;
color: #6b7280;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-weight: 500;
}
.terminal-body {
background: #1a1b26;
padding: 0;
display: flex;
}
.line-numbers {
background: #16161e;
padding: 16px 12px;
border-right: 1px solid rgba(124, 58, 237, 0.15);
user-select: none;
text-align: right;
min-width: 48px;
}
.line-num {
display: block;
color: #4a5568;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-size: 14px;
line-height: 1.8;
}
.code-content {
flex: 1;
padding: 16px 20px;
overflow-x: auto;
}
.code-line {
display: block;
white-space: pre;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-size: 14px;
line-height: 1.8;
color: #a9b1d6;
}
.json-key {
color: #7dcfff;
font-weight: 500;
}
.json-string {
color: #9ece6a;
}
.json-bracket {
color: #bb9af7;
font-weight: 600;
}
.json-colon {
color: #c0caf5;
}
.json-comma {
color: #c0caf5;
}
.copy-button {
width: 100%;
background: linear-gradient(135deg, #7c3aed, #6366f1) !important;
border: none !important;
border-radius: 12px !important;
padding: 14px 24px !important;
font-weight: 700 !important;
font-size: 13px !important;
color: white !important;
text-transform: uppercase;
letter-spacing: 1px;
cursor: pointer;
box-shadow: 0 4px 16px rgba(124, 58, 237, 0.4) !important;
transition: all 0.3s ease !important;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
.copy-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 6px 24px rgba(124, 58, 237, 0.6) !important;
}
/* ===== TOOLS TABLE ===== */
.tools-table,
table.tools-table,
.light .tools-table,
.dark .tools-table,
[data-theme="light"] .tools-table,
[data-theme="dark"] .tools-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
background: #0d0d1f !important;
background-color: #0d0d1f !important;
border-radius: 12px;
overflow: hidden;
border: 1px solid #3d2a6b !important;
margin-bottom: 0;
flex: 1;
color: #cbd5e1 !important;
}
.tools-table th,
table.tools-table th,
.light .tools-table th,
.dark .tools-table th,
[data-theme="light"] .tools-table th,
[data-theme="dark"] .tools-table th {
background: #1f1545 !important;
background-color: #1f1545 !important;
color: #a5b4fc !important;
font-weight: 700;
font-size: 16px;
text-transform: uppercase;
letter-spacing: 1.5px;
padding: 20px 14px;
text-align: left;
border-bottom: 1px solid #3d2a6b !important;
}
.tools-table td,
table.tools-table td,
.light .tools-table td,
.dark .tools-table td,
[data-theme="light"] .tools-table td,
[data-theme="dark"] .tools-table td {
padding: 20px 14px;
color: #cbd5e1 !important;
background: #0d0d1f !important;
background-color: #0d0d1f !important;
font-size: 16px;
border-bottom: 1px solid #1a1535 !important;
}
.tools-table tr:last-child td {
border-bottom: none;
}
.tools-table tr:hover,
.tools-table tr:hover td {
background: #1a1540 !important;
background-color: #1a1540 !important;
}
.tool-name,
.light .tool-name,
.dark .tool-name,
[data-theme="light"] .tool-name,
[data-theme="dark"] .tool-name {
color: #22d3ee !important;
font-family: 'SF Mono', 'Monaco', 'Consolas', monospace;
font-weight: 600;
font-size: 13px;
vertical-align: middle;
}
/* ===== COMPOSITE SECTION ===== */
.composite-section,
.light .composite-section,
.dark .composite-section,
[data-theme="light"] .composite-section,
[data-theme="dark"] .composite-section {
background: #0d0d1f !important;
background-color: #0d0d1f !important;
border: 1px solid #3d2a6b !important;
border-radius: 12px;
padding: 20px;
color: #cbd5e1 !important;
}
.composite-header,
.light .composite-header,
.dark .composite-header,
[data-theme="light"] .composite-header,
[data-theme="dark"] .composite-header {
font-size: 11px;
font-weight: 700;
color: #a5b4fc !important;
text-transform: uppercase;
letter-spacing: 1.5px;
margin-bottom: 12px;
}
.composite-content,
.light .composite-content,
.dark .composite-content,
[data-theme="light"] .composite-content,
[data-theme="dark"] .composite-content {
color: #cbd5e1 !important;
font-size: 12px;
line-height: 1.6;
margin-bottom: 16px;
}
.try-demo-button {
width: 100%;
background: transparent !important;
border: 2px solid #7c3aed !important;
border-radius: 12px !important;
padding: 12px 24px !important;
font-weight: 700 !important;
font-size: 12px !important;
color: #7c3aed !important;
text-transform: uppercase;
letter-spacing: 1px;
cursor: pointer;
transition: all 0.3s ease !important;
}
.try-demo-button:hover {
background: rgba(124, 58, 237, 0.1) !important;
border-color: #7c3aed !important;
color: #8b5cf6 !important;
}
/* ===== BUTTONS ===== */
button[variant="primary"] {
background: linear-gradient(135deg, #7c3aed, #6366f1) !important;
border: none !important;
border-radius: 12px !important;
padding: 14px 32px !important;
font-weight: 700 !important;
font-size: 14px !important;
color: white !important;
box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4) !important;
transition: all 0.3s ease !important;
}
button[variant="primary"]:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 32px rgba(124, 58, 237, 0.6) !important;
}
/* ===== AUDIO COMPONENT ===== */
.gradio-audio {
background: rgba(30, 27, 75, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 12px !important;
}
/* ===== TEXTBOX ===== */
textarea {
background: rgba(30, 27, 75, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 12px !important;
color: #e0e7ff !important;
font-size: 13px !important;
}
/* ===== DROPDOWN ===== */
select {
background: rgba(30, 27, 75, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 12px !important;
color: #e0e7ff !important;
}
/* ===== LABELS ===== */
label {
color: #a5b4fc !important;
font-weight: 600 !important;
font-size: 12px !important;
text-transform: uppercase;
letter-spacing: 0.5px;
}
/* ===== HTML OUTPUT ===== */
.gradio-html {
background: transparent !important;
border: none !important;
}
/* ===== DEMO ROW LAYOUT ===== */
.demo-row {
display: flex !important;
gap: 24px !important;
align-items: stretch !important;
}
/* Only apply card style to the outer column (demo-card-column) */
.demo-card-column {
display: flex !important;
flex-direction: column !important;
height: 700px !important;
min-height: 700px !important;
max-height: 700px !important;
background: rgba(15, 15, 35, 0.8) !important;
backdrop-filter: blur(20px) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 20px !important;
padding: 4px 4px 2px 4px !important;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4) !important;
transition: all 0.3s ease !important;
gap: 2px !important;
overflow-y: auto !important;
}
.demo-card-column:hover {
border-color: rgba(124, 58, 237, 0.5) !important;
box-shadow: 0 12px 48px rgba(124, 58, 237, 0.3) !important;
}
/* Remove any border/background from inner elements */
.demo-card-column > div,
.demo-card-column > div > div,
.demo-row > div > div {
background: transparent !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
border-radius: 0 !important;
}
/* Remove card background from inner HTML - we use column background instead */
.demo-row .card {
background: transparent !important;
backdrop-filter: none !important;
border: none !important;
border-radius: 0 !important;
padding: 0 !important;
box-shadow: none !important;
margin-bottom: 12px !important;
}
.demo-row .card:hover {
border: none !important;
box-shadow: none !important;
}
/* Ensure all inner components have transparent background */
.demo-row .gradio-audio,
.demo-row .gradio-dropdown,
.demo-row .gradio-textbox,
.demo-row .gradio-button {
background: transparent !important;
}
/* Create a wrapper for input elements (flex container) */
.demo-card-column > div:not(:last-child) {
flex: 0 0 auto !important;
}
/* Adjust spacing for input elements in demo cards */
.demo-row .gradio-audio {
margin-top: 6px !important;
margin-bottom: 0px !important;
max-height: 50px !important;
min-height: 40px !important;
height: 45px !important;
}
/* Target all child elements of audio component */
.demo-row .gradio-audio > div,
.demo-row .gradio-audio .wrap,
.demo-row .gradio-audio .upload-container,
.demo-row .gradio-audio .record-container,
.demo-row .gradio-audio * {
max-height: 50px !important;
}
/* Audio player specific height reduction */
.demo-row .gradio-audio audio {
height: 26px !important;
max-height: 26px !important;
min-height: 26px !important;
}
/* Upload/record button container height */
.demo-row .gradio-audio .upload-container,
.demo-row .gradio-audio .record-container {
min-height: 38px !important;
max-height: 38px !important;
padding: 4px !important;
}
/* Audio component buttons */
.demo-row .gradio-audio button {
height: 28px !important;
min-height: 28px !important;
max-height: 28px !important;
padding: 4px 10px !important;
font-size: 10px !important;
}
/* Hide text nodes in audio upload area - keep icons */
.demo-row .gradio-audio .upload-text {
display: none !important;
}
.demo-row .gradio-audio .placeholder {
display: none !important;
}
.demo-row .gradio-audio span:not(:has(svg)) {
font-size: 0 !important;
}
.demo-row .gradio-audio p {
display: none !important;
}
/* Hide "Drop Audio Here", "- or -", "Click to Upload" text */
.demo-row .gradio-audio .upload-container span,
.demo-row .gradio-audio .upload-container p {
font-size: 0 !important;
line-height: 0 !important;
}
/* Keep SVG icons visible */
.demo-row .gradio-audio svg {
font-size: initial !important;
}
/* ADDITIONAL METHODS: Hide all text in audio upload area */
.demo-row .gradio-audio label {
font-size: 0 !important;
}
.demo-row .gradio-audio label span:not(:has(svg)) {
display: none !important;
}
.demo-row .gradio-audio .file-preview {
font-size: 0 !important;
}
.demo-row .gradio-audio .file-preview span {
font-size: 0 !important;
display: none !important;
}
.demo-row .gradio-audio [data-testid="upload-text"],
.demo-row .gradio-audio [data-testid="file-preview-text"],
.demo-row .gradio-audio .upload-text,
.demo-row .gradio-audio .file-preview-text {
display: none !important;
visibility: hidden !important;
font-size: 0 !important;
}
/* Target all text nodes (more aggressive) */
.demo-row .gradio-audio *:not(svg):not(path):not(circle):not(rect):not(line) {
color: transparent !important;
}
.demo-row .gradio-audio button {
color: white !important;
}
/* Ensure icons remain visible */
.demo-row .gradio-audio svg,
.demo-row .gradio-audio svg * {
color: initial !important;
fill: currentColor !important;
stroke: currentColor !important;
}
/* NUCLEAR OPTION: Hide everything in label, then show only necessary elements */
.demo-row .gradio-audio label > div > div {
display: none !important;
}
.demo-row .gradio-audio label::before {
content: '' !important;
}
.demo-row .gradio-audio label * {
visibility: hidden !important;
}
.demo-row .gradio-audio label svg {
visibility: visible !important;
}
.demo-row .gradio-audio label button {
visibility: visible !important;
}
.demo-row .gradio-audio label audio {
visibility: visible !important;
}
/* Force hide any text content */
.demo-row .gradio-audio label > div::after,
.demo-row .gradio-audio label > div::before {
content: '' !important;
display: none !important;
}
/* Additional override for upload text elements */
.demo-row .gradio-audio [class*="upload"],
.demo-row .gradio-audio [class*="placeholder"],
.demo-row .gradio-audio [class*="text"] {
font-size: 0 !important;
line-height: 0 !important;
width: 0 !important;
height: 0 !important;
opacity: 0 !important;
visibility: hidden !important;
position: absolute !important;
left: -9999px !important;
}
/* NUCLEAR OPTION 2: Complete removal of label content */
.demo-row .gradio-audio label.block {
display: none !important;
}
.demo-row .gradio-audio .file-upload {
display: none !important;
}
/* Hide all direct text children */
.demo-row .gradio-audio label > span:not(:has(button)):not(:has(audio)):not(:has(svg)) {
display: none !important;
}
/* Gradio 6.0 specific selectors - upload area */
.demo-row .gradio-audio [data-testid="upload-button"],
.demo-row .gradio-audio [data-testid="file-upload"],
.demo-row .gradio-audio .upload-area {
display: none !important;
}
/* Hide all paragraph elements in audio component */
.demo-row .gradio-audio label p,
.demo-row .gradio-audio label span.text,
.demo-row .gradio-audio label div.text {
display: none !important;
}
/* More aggressive text hiding - target by content */
.demo-row .gradio-audio *::before,
.demo-row .gradio-audio *::after {
content: '' !important;
display: none !important;
}
/* Make sure only buttons and audio players are visible */
.demo-row .gradio-audio > label > div > div:not(:has(button)):not(:has(audio)) {
display: none !important;
}
/* Gradio Blocks specific - Hide wrapper divs that contain text */
.demo-row .gradio-audio .wrap > div:not(:has(button)):not(:has(audio)):not(:has(svg)) {
display: none !important;
}
/* Override for Gradio 6.x structure */
.demo-row .gradio-audio [class*="svelte-"] span:not(:has(svg)):not(:has(button)) {
display: none !important;
}
.demo-row .gradio-dropdown,
.demo-row .gradio-textbox {
margin-bottom: 2px !important;
}
.demo-row .gradio-row {
margin-bottom: 2px !important;
}
/* IMPORTANT: Button alignment - push buttons to bottom with margin-top: auto */
.demo-row .gradio-button {
margin-top: auto !important;
margin-bottom: 0px !important;
flex-shrink: 0 !important;
}
/* Output area should not push button down - set flex: 1 */
.demo-row .gradio-html {
flex: 1 !important;
margin-bottom: 0 !important;
display: flex !important;
flex-direction: column !important;
max-height: 300px !important;
overflow-y: auto !important;
}
/* Output audio component (clean_audio_output) height limit */
.demo-row .gradio-audio[data-testid="audio-output"],
.demo-row > div:last-child .gradio-audio {
max-height: 120px !important;
min-height: 60px !important;
height: auto !important;
margin-bottom: 0px !important;
}
/* ===== CUSTOM ACTION BUTTONS (DEMO CARDS) ===== */
.custom-action-btn,
.custom-action-btn button,
.custom-action-btn button[data-testid="button"],
button.custom-action-btn,
.demo-row .custom-action-btn,
.demo-row .custom-action-btn button {
width: 100% !important;
min-width: 100% !important;
max-width: 100% !important;
background: linear-gradient(135deg, #6366f1, #7c3aed) !important;
border: none !important;
border-radius: 12px !important;
padding: 8px 16px !important;
height: 38px !important;
min-height: 38px !important;
max-height: 38px !important;
font-weight: 700 !important;
font-size: 16px !important;
letter-spacing: 1.5px !important;
text-transform: uppercase !important;
color: white !important;
box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4) !important;
transition: all 0.3s ease !important;
}
.custom-action-btn:hover,
.custom-action-btn button:hover,
.custom-action-btn button[data-testid="button"]:hover,
button.custom-action-btn:hover,
.demo-row .custom-action-btn:hover,
.demo-row .custom-action-btn button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 32px rgba(124, 58, 237, 0.6) !important;
background: linear-gradient(135deg, #6366f1, #7c3aed) !important;
}
/* ===== DECORATIVE ELEMENTS ===== */
.diamond-decoration {
position: fixed;
bottom: 40px;
right: 40px;
width: 80px;
height: 80px;
border: 2px solid rgba(124, 58, 237, 0.2);
transform: rotate(45deg);
pointer-events: none;
z-index: 1;
}
.star-decoration {
display: none;
}
"""
with gr.Blocks() as demo:
# Inject custom CSS and decorative elements (positioned fixed, no DOM space)
gr.HTML(f"""
<style>{custom_css}</style>
<div class="diamond-decoration"></div>
<div class="star-decoration">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none">
<path d="M12 2l3.09 6.26L22 9.27l-5 4.87 1.18 6.88L12 17.77l-6.18 3.25L7 14.14 2 9.27l6.91-1.01L12 2z" fill="#a5b4fc" opacity="0.4"/>
</svg>
</div>
<script>
// Force dark mode styling (no redirect, just add classes)
(function() {{
// Add dark mode classes immediately
document.documentElement.classList.add('dark');
document.documentElement.setAttribute('data-theme', 'dark');
document.body.classList.add('dark');
document.body.setAttribute('data-theme', 'dark');
// Also observe for Gradio container and add dark class there
const observer = new MutationObserver(function(mutations) {{
const container = document.querySelector('.gradio-container');
if (container && !container.classList.contains('dark')) {{
container.classList.add('dark');
container.setAttribute('data-theme', 'dark');
}}
}});
observer.observe(document.body, {{ childList: true, subtree: true }});
}})();
// JavaScript to completely remove upload text from Audio components in demo-row
function removeAudioUploadText() {{
// Find all audio components in demo-row
const demoRows = document.querySelectorAll('.demo-row');
demoRows.forEach(row => {{
const audioComponents = row.querySelectorAll('.gradio-audio');
audioComponents.forEach(audio => {{
// METHOD 1: Remove ALL text nodes (most aggressive)
const walker = document.createTreeWalker(
audio,
NodeFilter.SHOW_TEXT,
null,
false
);
const textNodesToRemove = [];
while(walker.nextNode()) {{
const node = walker.currentNode;
// Only keep text that's inside button or audio elements
const parentTag = node.parentElement?.tagName?.toLowerCase();
if (parentTag !== 'button' && parentTag !== 'audio') {{
textNodesToRemove.push(node);
}}
}}
textNodesToRemove.forEach(node => {{
if (node.parentNode) {{
node.parentNode.removeChild(node);
}}
}});
// METHOD 2: Hide elements by class/data attributes
const elementsToHide = audio.querySelectorAll(
'[class*="upload"], [class*="placeholder"], [class*="text"], ' +
'[data-testid*="upload"], [data-testid*="file"], ' +
'label.block, .file-upload, p, span:not(:has(button)):not(:has(svg))'
);
elementsToHide.forEach(el => {{
el.style.display = 'none';
el.style.visibility = 'hidden';
el.style.fontSize = '0';
el.style.lineHeight = '0';
el.style.width = '0';
el.style.height = '0';
el.style.opacity = '0';
el.style.position = 'absolute';
el.style.left = '-9999px';
}});
// METHOD 3: Remove label.block entirely if it exists
const labelBlocks = audio.querySelectorAll('label.block');
labelBlocks.forEach(label => {{
// Only remove if it doesn't contain button or audio
if (!label.querySelector('button') && !label.querySelector('audio')) {{
label.remove();
}}
}});
// METHOD 4: Clear innerHTML of divs that don't contain buttons/audio
const allDivs = audio.querySelectorAll('div');
allDivs.forEach(div => {{
if (!div.querySelector('button') && !div.querySelector('audio') && !div.querySelector('svg')) {{
// Check if div only contains text
const hasOnlyText = Array.from(div.childNodes).every(node =>
node.nodeType === Node.TEXT_NODE ||
(node.nodeType === Node.ELEMENT_NODE && !node.querySelector('button, audio, svg'))
);
if (hasOnlyText) {{
div.innerHTML = '';
}}
}}
}});
}});
}});
}}
// Run immediately
removeAudioUploadText();
// Run after DOM changes (MutationObserver)
const observer = new MutationObserver(() => {{
removeAudioUploadText();
}});
// Start observing after a short delay to ensure Gradio has loaded
setTimeout(() => {{
observer.observe(document.body, {{
childList: true,
subtree: true
}});
}}, 500);
// Also run on window load
window.addEventListener('load', removeAudioUploadText);
// Run periodically for the first 5 seconds (catch late renders)
let attempts = 0;
const interval = setInterval(() => {{
removeAudioUploadText();
attempts++;
if (attempts > 10) {{
clearInterval(interval);
}}
}}, 500);
</script>
""")
# ==================== HEADER (FLOATING) ====================
# Page header and documentation modal, emitted as one raw-HTML component.
# - This literal is an f-string: {readme_html} is interpolated into the modal
#   body, so any literal brace added to this markup must be doubled.
# - The DOCS button adds the "active" class to #docsModal; the close button, or
#   a click that lands on the overlay element itself (event.target === this),
#   removes it again.
# - NOTE(review): readme_html is inserted unescaped. Presumably it is the HTML
#   built from README.md by load_readme_as_html() -- confirm it is trusted.
gr.HTML(f"""
<div class="header-main">
<div class="header-left">
<span class="header-icon">
<svg width="72" height="72" viewBox="0 0 52 52" fill="none">
<defs>
<linearGradient id="logoGradHeader" x1="0%" y1="0%" x2="100%" y2="100%">
<stop offset="0%" style="stop-color:#7c3aed"/>
<stop offset="100%" style="stop-color:#6366f1"/>
</linearGradient>
</defs>
<!-- Left: Microphone (rounded capsule + stand) -->
<!-- Microphone capsule (rounded rect) -->
<rect x="8" y="12" width="9" height="14" rx="4.5" fill="url(#logoGradHeader)"/>
<!-- Microphone grill lines (horizontal detail) -->
<line x1="9" y1="16" x2="14" y2="16" stroke="#000000" stroke-width="0.8" stroke-linecap="round"/>
<line x1="9" y1="19.5" x2="14" y2="19.5" stroke="#000000" stroke-width="0.8" stroke-linecap="round"/>
<line x1="9" y1="23" x2="14" y2="23" stroke="#000000" stroke-width="0.8" stroke-linecap="round"/>
<!-- Arc stand -->
<path d="M6.5 26c0 2.5 2.2 5 6 5s6-2.5 6-5" stroke="url(#logoGradHeader)" stroke-width="2" fill="none" stroke-linecap="round"/>
<!-- Pole -->
<rect x="11.5" y="31" width="2" height="5" fill="url(#logoGradHeader)"/>
<!-- Base -->
<rect x="7.5" y="36" width="9" height="2" rx="1" fill="url(#logoGradHeader)"/>
<!-- Right: Audio Wave Bars (4 vertical bars with different heights) -->
<rect x="28" y="18" width="3" height="16" rx="1.5" fill="url(#logoGradHeader)" opacity="0.9"/>
<rect x="34" y="14" width="3" height="24" rx="1.5" fill="url(#logoGradHeader)" opacity="0.95"/>
<rect x="40" y="20" width="3" height="12" rx="1.5" fill="url(#logoGradHeader)" opacity="0.85"/>
<rect x="46" y="22" width="3" height="8" rx="1.5" fill="url(#logoGradHeader)" opacity="0.8"/>
</svg>
</span>
<div>
<span class="header-title">VoiceKit</span>
<span class="header-subtitle">MCP Server</span>
</div>
</div>
<button class="docs-button" onclick="document.getElementById('docsModal').classList.add('active')">
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/>
<polyline points="14 2 14 8 20 8"/>
<line x1="16" y1="13" x2="8" y2="13"/>
<line x1="16" y1="17" x2="8" y2="17"/>
<polyline points="10 9 9 9 8 9"/>
</svg>
DOCS
</button>
</div>
<!-- DOCS Modal -->
<div id="docsModal" class="docs-modal-overlay" onclick="if(event.target === this) this.classList.remove('active')">
<div class="docs-modal">
<div class="docs-modal-header">
<div class="docs-modal-title">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#a78bfa" stroke-width="2">
<path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/>
<polyline points="14 2 14 8 20 8"/>
</svg>
Documentation
</div>
<button class="docs-modal-close" onclick="document.getElementById('docsModal').classList.remove('active')">&times;</button>
</div>
<div class="docs-modal-content">
{readme_html}
</div>
</div>
</div>
""")
# ==================== TOP ROW: QUICK START + AVAILABLE TOOLS ====================
with gr.Row(equal_height=True):
# QUICK START CARD
with gr.Column(scale=1):
# Static "Quick Start" card: a terminal-styled rendering of
# claude_desktop_config.json plus a COPY CONFIG button.
# - Plain (non-f) string, so the braces in the onclick JavaScript object
#   literal are emitted as written.
# - The line-number gutter is hard-coded 1-12 to match the 12 code lines.
# - The on-screen snippet and the clipboard payload built in the button's
#   onclick are written out separately; the endpoint URL appears in both and
#   must be kept in sync by hand.
gr.HTML("""
<div class="card" style="min-height: 550px;">
<div class="card-title">
<svg width="18" height="18" viewBox="0 0 24 24" fill="#7c3aed" style="margin-right: 8px;">
<path d="M19.14 12.94c.04-.31.06-.63.06-.94 0-.31-.02-.63-.06-.94l2.03-1.58c.18-.14.23-.41.12-.61l-1.92-3.32c-.12-.22-.37-.29-.59-.22l-2.39.96c-.5-.38-1.03-.7-1.62-.94l-.36-2.54c-.04-.24-.24-.41-.48-.41h-3.84c-.24 0-.43.17-.47.41l-.36 2.54c-.59.24-1.13.57-1.62.94l-2.39-.96c-.22-.08-.47 0-.59.22L2.74 8.87c-.12.21-.08.47.12.61l2.03 1.58c-.04.31-.06.63-.06.94s.02.63.06.94l-2.03 1.58c-.18.14-.23.41-.12.61l1.92 3.32c.12.22.37.29.59.22l2.39-.96c.5.38 1.03.7 1.62.94l.36 2.54c.05.24.24.41.48.41h3.84c.24 0 .44-.17.47-.41l.36-2.54c.59-.24 1.13-.56 1.62-.94l2.39.96c.22.08.47 0 .59-.22l1.92-3.32c.12-.22.07-.47-.12-.61l-2.01-1.58zM12 15.6c-1.98 0-3.6-1.62-3.6-3.6s1.62-3.6 3.6-3.6 3.6 1.62 3.6 3.6-1.62 3.6-3.6 3.6z"/>
</svg>
QUICK START
</div>
<div class="terminal-window">
<!-- Terminal Header with Dots and Filename -->
<div class="terminal-header">
<div class="terminal-dots">
<div class="terminal-dot red"></div>
<div class="terminal-dot yellow"></div>
<div class="terminal-dot green"></div>
</div>
<div class="terminal-title">claude_desktop_config.json</div>
<div style="width: 60px;"></div> <!-- Spacer for center alignment -->
</div>
<!-- Terminal Body with Line Numbers and Code -->
<div class="terminal-body">
<div class="line-numbers">
<div class="line-num">1</div>
<div class="line-num">2</div>
<div class="line-num">3</div>
<div class="line-num">4</div>
<div class="line-num">5</div>
<div class="line-num">6</div>
<div class="line-num">7</div>
<div class="line-num">8</div>
<div class="line-num">9</div>
<div class="line-num">10</div>
<div class="line-num">11</div>
<div class="line-num">12</div>
</div>
<div class="code-content">
<div class="code-line"><span class="json-bracket">{</span></div>
<div class="code-line"> <span class="json-key">"mcpServers"</span><span class="json-colon">:</span> <span class="json-bracket">{</span></div>
<div class="code-line"> <span class="json-key">"voicekit"</span><span class="json-colon">:</span> <span class="json-bracket">{</span></div>
<div class="code-line"> <span class="json-key">"command"</span><span class="json-colon">:</span> <span class="json-string">"npx"</span><span class="json-comma">,</span></div>
<div class="code-line"> <span class="json-key">"args"</span><span class="json-colon">:</span> <span class="json-bracket">[</span></div>
<div class="code-line"> <span class="json-string">"-y"</span><span class="json-comma">,</span></div>
<div class="code-line"> <span class="json-string">"mcp-remote"</span><span class="json-comma">,</span></div>
<div class="code-line"> <span class="json-string">"https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse"</span></div>
<div class="code-line"> <span class="json-bracket">]</span></div>
<div class="code-line"> <span class="json-bracket">}</span></div>
<div class="code-line"> <span class="json-bracket">}</span></div>
<div class="code-line"><span class="json-bracket">}</span></div>
</div>
</div>
</div>
<button class="copy-button" onclick="navigator.clipboard.writeText(JSON.stringify({mcpServers:{voicekit:{command:'npx',args:['-y','mcp-remote','https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse']}}},null,2))">
<svg width="16" height="16" viewBox="0 0 24 24" fill="white" style="display: inline-block; vertical-align: middle;">
<rect x="9" y="9" width="13" height="13" rx="2" ry="2" fill="white"/>
<path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1" fill="none" stroke="white" stroke-width="2"/>
</svg>
COPY CONFIG
</button>
</div>
""")
# AVAILABLE TOOLS CARD
with gr.Column(scale=1):
# Static "Available Tools" card: a four-column table (TOOL / PURPOSE / INPUT /
# OUTPUT) with one row per tool, in this order: extract_embedding, match_voice,
# analyze_acoustics, transcribe_audio, isolate_voice, grade_voice.
# The table is a hand-written literal; nothing here is generated from the tool
# functions, so a signature change must be mirrored in this markup manually.
gr.HTML("""
<div class="card" style="min-height: 550px;">
<div class="card-title">
<svg width="18" height="18" viewBox="0 0 24 24" fill="#7c3aed" style="margin-right: 8px;">
<path d="M22.7 19l-9.1-9.1c.9-2.3.4-5-1.5-6.9-2-2-5-2.4-7.4-1.3L9 6 6 9 1.6 4.7C.4 7.1.9 10.1 2.9 12.1c1.9 1.9 4.6 2.4 6.9 1.5l9.1 9.1c.4.4 1 .4 1.4 0l2.3-2.3c.5-.4.5-1.1.1-1.4z"/>
</svg>
AVAILABLE TOOLS
</div>
<table class="tools-table">
<thead>
<tr>
<th>TOOL</th>
<th>PURPOSE</th>
<th>INPUT</th>
<th>OUTPUT</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<div style="display: flex; align-items: center; gap: 12px;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M21 16V8L12 4L3 8V16L12 20L21 16Z" stroke="#A855F7" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M12 4V12M12 12V20M12 12L21 8M12 12L3 8" stroke="#A855F7" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<circle cx="12" cy="12" r="2" fill="#A855F7"/>
<circle cx="16.5" cy="10" r="1.5" fill="#A855F7"/>
<circle cx="7.5" cy="14" r="1.5" fill="#A855F7"/>
<path d="M12 12L16.5 10M12 12L7.5 14" stroke="#A855F7" stroke-width="1.5" stroke-linecap="round"/>
</svg>
<span class="tool-name">extract_embedding</span>
</div>
</td>
<td>Extract 768-dim voice fingerprint</td>
<td>audio_base64</td>
<td>embedding, model, dim</td>
</tr>
<tr>
<td>
<div style="display: flex; align-items: center; gap: 12px;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M2 10V14" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M5 8V16" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M8 11V13" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M22 10V14" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M19 7V17" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M16 11V13" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M10 12H14" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M10 12L11.5 10.5" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M10 12L11.5 13.5" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M14 12L12.5 10.5" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M14 12L12.5 13.5" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
</svg>
<span class="tool-name">match_voice</span>
</div>
</td>
<td>Compare two voice similarities</td>
<td>audio1_base64, audio2_base64</td>
<td>similarity, tone_score</td>
</tr>
<tr>
<td>
<div style="display: flex; align-items: center; gap: 12px;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M22 10C22 10 20 4 17 4C14 4 12 16 9 16C6 16 4 10 2 10" stroke="#A855F7" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<g transform="translate(13, 11)">
<circle cx="5" cy="5" r="4" stroke="#A855F7" stroke-width="1.5"/>
<path d="M8 8L11 11" stroke="#A855F7" stroke-width="1.5" stroke-linecap="round"/>
</g>
</svg>
<span class="tool-name">analyze_acoustics</span>
</div>
</td>
<td>Analyze pitch, energy, rhythm, tempo</td>
<td>audio_base64</td>
<td>pitch, energy, rhythm, tempo</td>
</tr>
<tr>
<td>
<div style="display: flex; align-items: center; gap: 12px;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M2 12C2 12 4 5 7 5C10 5 11 19 14 19C15.5 19 16.5 15 16.5 15" stroke="#A855F7" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M19 7H22" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M19 12H22" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
<path d="M19 17H22" stroke="#A855F7" stroke-width="2" stroke-linecap="round"/>
</svg>
<span class="tool-name">transcribe_audio</span>
</div>
</td>
<td>Convert speech to text</td>
<td>audio_base64, language</td>
<td>text, language, model</td>
</tr>
<tr>
<td>
<div style="display: flex; align-items: center; gap: 12px;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M12 5V19" stroke="#A855F7" stroke-width="2.5" stroke-linecap="round"/>
<path d="M9 8V16" stroke="#A855F7" stroke-width="2.5" stroke-linecap="round"/>
<path d="M15 8V16" stroke="#A855F7" stroke-width="2.5" stroke-linecap="round"/>
<path d="M5 4H3V20H5" stroke="#A855F7" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M19 4H21V20H19" stroke="#A855F7" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
<span class="tool-name">isolate_voice</span>
</div>
</td>
<td>Remove background music/noise</td>
<td>audio_base64</td>
<td>isolated_audio_base64, metadata</td>
</tr>
<tr>
<td>
<div style="display: flex; align-items: center; gap: 12px;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<circle cx="12" cy="12" r="9" stroke="#A855F7" stroke-width="1" opacity="0.3"/>
<path d="M12 3V21" stroke="#A855F7" stroke-width="1" opacity="0.3"/>
<path d="M4.2 7.5L19.8 16.5" stroke="#A855F7" stroke-width="1" opacity="0.3"/>
<path d="M19.8 7.5L4.2 16.5" stroke="#A855F7" stroke-width="1" opacity="0.3"/>
<path d="M12 5L18 9L16.5 18H7.5L6 9L12 5Z" fill="#A855F7" fill-opacity="0.4" stroke="#A855F7" stroke-width="2" stroke-linejoin="round"/>
<circle cx="12" cy="5" r="1.5" fill="#A855F7"/>
<circle cx="18" cy="9" r="1.5" fill="#A855F7"/>
<circle cx="16.5" cy="18" r="1.5" fill="#A855F7"/>
<circle cx="7.5" cy="18" r="1.5" fill="#A855F7"/>
<circle cx="6" cy="9" r="1.5" fill="#A855F7"/>
</svg>
<span class="tool-name">grade_voice</span>
</div>
</td>
<td>5-metric comprehensive analysis</td>
<td>user_audio, reference_audio, text, category</td>
<td>overall, metrics, feedback</td>
</tr>
</tbody>
</table>
</div>
""")
# ==================== FIRST ROW: 3 DEMO CARDS ====================
with gr.Row(equal_height=True, elem_classes="demo-row"):
# EXTRACT EMBEDDING demo card: title markup, one audio input, an action
# button, and an HTML result panel.
with gr.Column(elem_classes="demo-card-column", scale=1):
    # Card title (icon + label). The string content stays flush-left so the
    # emitted markup is unchanged.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 8px; padding-left: 18px; padding-top: 10px;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M21 16V8L12 4L3 8V16L12 20L21 16Z" stroke="#7c3aed" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M12 4V12M12 12V20M12 12L21 8M12 12L3 8" stroke="#7c3aed" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<circle cx="12" cy="12" r="2" fill="#7c3aed"/>
<circle cx="16.5" cy="10" r="1.5" fill="#7c3aed"/>
<circle cx="7.5" cy="14" r="1.5" fill="#7c3aed"/>
<path d="M12 12L16.5 10M12 12L7.5 14" stroke="#7c3aed" stroke-width="1.5" stroke-linecap="round"/>
</svg>
<div style="font-size: 16px; font-weight: 700; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px;">
EXTRACT EMBEDDING
</div>
</div>
""")
    embedding_audio = gr.Audio(label="Audio Input", show_label=False, type="filepath", format="wav")
    embedding_btn = gr.Button(
        value="EXTRACT",
        variant="primary",
        size="lg",
        elem_classes="custom-action-btn",
    )
    # The result panel initially shows whatever create_embedding_empty() returns.
    embedding_output = gr.HTML(create_embedding_empty())
    # Demo-only handler, registered with api_visibility="private" (unlike the
    # hidden MCP row, which uses the default visibility).
    embedding_btn.click(
        fn=demo_extract_embedding,
        inputs=[embedding_audio],
        outputs=[embedding_output],
        api_visibility="private",
    )
# MATCH VOICE demo card: title markup, two audio inputs in one row, a COMPARE
# button, and an HTML result panel.
with gr.Column(elem_classes="demo-card-column", scale=1):
    # Card title (icon + label). The string content stays flush-left so the
    # emitted markup is unchanged.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 8px; padding-left: 18px; padding-top: 10px;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M2 10V14" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M5 8V16" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M8 11V13" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M22 10V14" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M19 7V17" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M16 11V13" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M10 12H14" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M10 12L11.5 10.5" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M10 12L11.5 13.5" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M14 12L12.5 10.5" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M14 12L12.5 13.5" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
</svg>
<div style="font-size: 16px; font-weight: 700; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px;">
MATCH VOICE
</div>
</div>
""")
    # The two recordings to compare share a single row.
    with gr.Row():
        compare_audio1 = gr.Audio(label="Audio 1", show_label=False, type="filepath", format="wav")
        compare_audio2 = gr.Audio(label="Audio 2", show_label=False, type="filepath", format="wav")
    compare_btn = gr.Button(
        value="COMPARE",
        variant="primary",
        size="lg",
        elem_classes="custom-action-btn",
    )
    # The result panel initially shows whatever create_compare_empty() returns.
    compare_output = gr.HTML(create_compare_empty())
    # Demo-only handler, registered with api_visibility="private".
    compare_btn.click(
        fn=demo_match_voice,
        inputs=[compare_audio1, compare_audio2],
        outputs=[compare_output],
        api_visibility="private",
    )
# ANALYZE ACOUSTICS demo card: title markup, one audio input, an ANALYZE
# button, and an HTML result panel.
with gr.Column(elem_classes="demo-card-column", scale=1):
    # Card title (icon + label). The string content stays flush-left so the
    # emitted markup is unchanged.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 8px; padding-left: 18px; padding-top: 10px;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M22 10C22 10 20 4 17 4C14 4 12 16 9 16C6 16 4 10 2 10" stroke="#7c3aed" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<g transform="translate(13, 11)">
<circle cx="5" cy="5" r="4" stroke="#7c3aed" stroke-width="1.5"/>
<path d="M8 8L11 11" stroke="#7c3aed" stroke-width="1.5" stroke-linecap="round"/>
</g>
</svg>
<div style="font-size: 16px; font-weight: 700; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px;">
ANALYZE ACOUSTICS
</div>
</div>
""")
    acoustic_audio = gr.Audio(label="Audio Input", show_label=False, type="filepath", format="wav")
    acoustic_btn = gr.Button(
        value="ANALYZE",
        variant="primary",
        size="lg",
        elem_classes="custom-action-btn",
    )
    # The result panel initially shows whatever create_acoustic_empty() returns.
    acoustic_output = gr.HTML(create_acoustic_empty())
    # Demo-only handler, registered with api_visibility="private".
    acoustic_btn.click(
        fn=demo_acoustic_analysis,
        inputs=[acoustic_audio],
        outputs=[acoustic_output],
        api_visibility="private",
    )
# ==================== SECOND ROW: 3 MORE DEMO CARDS ====================
with gr.Row(equal_height=True, elem_classes="demo-row"):
# TRANSCRIBE AUDIO demo card: title markup, one audio input, a TRANSCRIBE
# button, and an HTML result panel.
with gr.Column(elem_classes="demo-card-column", scale=1):
    # Card title (icon + label). The string content stays flush-left so the
    # emitted markup is unchanged.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 8px; padding-left: 18px; padding-top: 10px;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M2 12C2 12 4 5 7 5C10 5 11 19 14 19C15.5 19 16.5 15 16.5 15" stroke="#7c3aed" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M19 7H22" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M19 12H22" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
<path d="M19 17H22" stroke="#7c3aed" stroke-width="2" stroke-linecap="round"/>
</svg>
<div style="font-size: 16px; font-weight: 700; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px;">
TRANSCRIBE AUDIO
</div>
</div>
""")
    transcribe_audio_input = gr.Audio(label="Audio Input", show_label=False, type="filepath", format="wav")
    transcribe_btn = gr.Button(
        value="TRANSCRIBE",
        variant="primary",
        size="lg",
        elem_classes="custom-action-btn",
    )
    # The result panel initially shows whatever create_transcription_empty() returns.
    transcribe_output = gr.HTML(create_transcription_empty())
    # The card has no language selector: the wrapper always passes "en" as
    # the second argument. Demo-only handler (api_visibility="private").
    transcribe_btn.click(
        fn=lambda audio: demo_transcribe_audio(audio, "en"),
        inputs=[transcribe_audio_input],
        outputs=[transcribe_output],
        api_visibility="private",
    )
# ISOLATE VOICE demo card: title markup, one audio input, an EXTRACT VOICE
# button, and an audio component that receives the result.
with gr.Column(elem_classes="demo-card-column", scale=1):
    # Card title (icon + label). The string content stays flush-left so the
    # emitted markup is unchanged.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 8px; padding-left: 18px; padding-top: 10px;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<path d="M12 5V19" stroke="#7c3aed" stroke-width="2.5" stroke-linecap="round"/>
<path d="M9 8V16" stroke="#7c3aed" stroke-width="2.5" stroke-linecap="round"/>
<path d="M15 8V16" stroke="#7c3aed" stroke-width="2.5" stroke-linecap="round"/>
<path d="M5 4H3V20H5" stroke="#7c3aed" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M19 4H21V20H19" stroke="#7c3aed" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
<div style="font-size: 16px; font-weight: 700; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px;">
ISOLATE VOICE
</div>
</div>
""")
    clean_audio_input = gr.Audio(
        label="Audio with Background",
        show_label=False,
        type="filepath",
        format="wav",
    )
    clean_btn = gr.Button(
        value="EXTRACT VOICE",
        variant="primary",
        size="lg",
        elem_classes="custom-action-btn",
    )
    # Unlike the other cards, the result here is an audio component rather
    # than an HTML panel.
    clean_audio_output = gr.Audio(type="filepath", label="Clean Audio", visible=True)
    # Demo-only handler, registered with api_visibility="private".
    clean_btn.click(
        fn=demo_clean_extraction,
        inputs=[clean_audio_input],
        outputs=[clean_audio_output],
        api_visibility="private",
    )
# GRADE VOICE demo card: title markup, user and reference audio inputs in one
# row, an ANALYZE button, and an HTML result panel.
with gr.Column(elem_classes="demo-card-column", scale=1):
    # Card title (icon + label). The string content stays flush-left so the
    # emitted markup is unchanged.
    gr.HTML("""
<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 8px; padding-left: 18px; padding-top: 10px;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" style="flex-shrink: 0;">
<circle cx="12" cy="12" r="9" stroke="#7c3aed" stroke-width="1" opacity="0.3"/>
<path d="M12 3V21" stroke="#7c3aed" stroke-width="1" opacity="0.3"/>
<path d="M4.2 7.5L19.8 16.5" stroke="#7c3aed" stroke-width="1" opacity="0.3"/>
<path d="M19.8 7.5L4.2 16.5" stroke="#7c3aed" stroke-width="1" opacity="0.3"/>
<path d="M12 5L18 9L16.5 18H7.5L6 9L12 5Z" fill="#7c3aed" fill-opacity="0.4" stroke="#7c3aed" stroke-width="2" stroke-linejoin="round"/>
<circle cx="12" cy="5" r="1.5" fill="#7c3aed"/>
<circle cx="18" cy="9" r="1.5" fill="#7c3aed"/>
<circle cx="16.5" cy="18" r="1.5" fill="#7c3aed"/>
<circle cx="7.5" cy="18" r="1.5" fill="#7c3aed"/>
<circle cx="6" cy="9" r="1.5" fill="#7c3aed"/>
</svg>
<div style="font-size: 16px; font-weight: 700; color: #a5b4fc; text-transform: uppercase; letter-spacing: 1px;">
GRADE VOICE
</div>
</div>
""")
    # The user's recording and the reference recording share a single row.
    with gr.Row():
        similarity_user_audio = gr.Audio(
            label="User Audio",
            show_label=False,
            type="filepath",
            format="wav",
        )
        similarity_ref_audio = gr.Audio(
            label="Reference Audio",
            show_label=False,
            type="filepath",
            format="wav",
        )
    similarity_btn = gr.Button(
        value="ANALYZE",
        variant="primary",
        size="lg",
        elem_classes="custom-action-btn",
    )
    # The result panel initially shows whatever create_similarity_empty() returns.
    similarity_output = gr.HTML(create_similarity_empty())
    # Only the two audio inputs are passed to the demo handler.
    # Demo-only handler, registered with api_visibility="private".
    similarity_btn.click(
        fn=demo_voice_similarity,
        inputs=[similarity_user_audio, similarity_ref_audio],
        outputs=[similarity_output],
        api_visibility="private",
    )
# ==================== MCP TOOL INTERFACES (HIDDEN, API ONLY) ====================
# An invisible row holding one Textbox-in / Textbox-out / Button trio per tool
# function. These listeners use the default API visibility, whereas the demo
# cards register theirs with api_visibility="private".
# NOTE(review): presumably these six listeners are what the MCP server exposes
# as tools -- confirm against the Gradio MCP documentation.
# Component and listener creation order is kept exactly as before.
with gr.Row(visible=False):
    # extract_embedding
    mcp_emb_input = gr.Textbox()
    mcp_emb_output = gr.Textbox()
    mcp_emb_btn = gr.Button()
    mcp_emb_btn.click(
        fn=extract_embedding,
        inputs=[mcp_emb_input],
        outputs=[mcp_emb_output],
    )

    # match_voice
    mcp_cmp_input1 = gr.Textbox()
    mcp_cmp_input2 = gr.Textbox()
    mcp_cmp_output = gr.Textbox()
    mcp_cmp_btn = gr.Button()
    mcp_cmp_btn.click(
        fn=match_voice,
        inputs=[mcp_cmp_input1, mcp_cmp_input2],
        outputs=[mcp_cmp_output],
    )

    # analyze_acoustics
    mcp_ac_input = gr.Textbox()
    mcp_ac_output = gr.Textbox()
    mcp_ac_btn = gr.Button()
    mcp_ac_btn.click(
        fn=analyze_acoustics,
        inputs=[mcp_ac_input],
        outputs=[mcp_ac_output],
    )

    # transcribe_audio (the language textbox is pre-filled with "en")
    mcp_tr_input = gr.Textbox()
    mcp_tr_lang = gr.Textbox(value="en")
    mcp_tr_output = gr.Textbox()
    mcp_tr_btn = gr.Button()
    mcp_tr_btn.click(
        fn=transcribe_audio,
        inputs=[mcp_tr_input, mcp_tr_lang],
        outputs=[mcp_tr_output],
    )

    # isolate_voice
    mcp_iso_input = gr.Textbox()
    mcp_iso_output = gr.Textbox()
    mcp_iso_btn = gr.Button()
    mcp_iso_btn.click(
        fn=isolate_voice,
        inputs=[mcp_iso_input],
        outputs=[mcp_iso_output],
    )

    # grade_voice (the category textbox is pre-filled with "meme")
    mcp_sim_user = gr.Textbox()
    mcp_sim_ref = gr.Textbox()
    mcp_sim_text = gr.Textbox()
    mcp_sim_cat = gr.Textbox(value="meme")
    mcp_sim_output = gr.Textbox()
    mcp_sim_btn = gr.Button()
    mcp_sim_btn.click(
        fn=grade_voice,
        inputs=[mcp_sim_user, mcp_sim_ref, mcp_sim_text, mcp_sim_cat],
        outputs=[mcp_sim_output],
    )
if __name__ == "__main__":
    # Script entry point: start the Gradio app with its MCP server enabled.
    # The server listens on 0.0.0.0:7860; the module docstring documents the
    # resulting MCP endpoint as /gradio_api/mcp/sse.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "mcp_server": True,
    }
    demo.launch(**launch_options)