# Source: ensemble-tts-annotation / notebooks / quickstart_example.py
# (GitHub page header commented out so the file is valid Python:
#  author "marcosremar", commit "Add complete infrastructure and
#  updated README for OPTION A", fe63a26)
"""
Quickstart Example - Ensemble TTS Annotation
This notebook demonstrates how to use the EnsembleAnnotator for Portuguese BR
TTS annotation with OPTION A (3-model ensemble).
"""
# %% Install dependencies (if needed)
# !pip install -r ../requirements.txt
# %% Imports
from ensemble_tts import EnsembleAnnotator
import librosa
import pandas as pd
from pathlib import Path
# %% Example 1: Annotate a single audio file
print("=" * 60)
print("Example 1: Single Audio File Annotation")
print("=" * 60)

# Build the OPTION A (balanced) ensemble: emotion2vec (fine-tuned)
# + Whisper + SenseVoice, combined by weighted voting.
annotator = EnsembleAnnotator(
    mode='balanced',
    device='cpu',  # switch to 'cuda' when a GPU is available
    voting_strategy='weighted',
    enable_events=True
)

# Real usage would pass a file path:
# result = annotator.annotate('path/to/audio.wav')

# Demo: one second of synthetic noise at 16 kHz instead of a real file.
import numpy as np
dummy_audio = np.random.randn(16000)
result = annotator.annotate(dummy_audio, sample_rate=16000)

emotion = result['emotion']
print("\nResult:")
print(f"Emotion: {emotion['label']}")
print(f"Confidence: {emotion['confidence']:.2%}")
print(f"Agreement: {emotion['agreement']:.2%}")
print(f"Events detected: {result['events']['detected']}")
# %% Example 2: Annotate multiple files
print("\n" + "=" * 60)
print("Example 2: Batch Annotation")
print("=" * 60)

# Real usage would pass a list of audio file paths, e.g.:
# audio_files = ['audio1.wav', 'audio2.wav', 'audio3.wav']
# results = annotator.annotate_batch(audio_files)
# (kept as a comment: the original defined `audio_files` but never used it)

# Demo: three synthetic clips of 1, 2 and 3 seconds at 16 kHz.
dummy_audios = [np.random.randn(16000 * (i + 1)) for i in range(3)]

# Annotate all clips in one call; one sample rate per clip.
results = annotator.annotate_batch(
    dummy_audios,
    sample_rates=[16000] * 3
)

print(f"\nAnnotated {len(results)} files")
for i, result in enumerate(results):
    print(f" File {i+1}: {result['emotion']['label']} ({result['emotion']['confidence']:.2%})")
# %% Example 3: Annotate HuggingFace dataset
print("\n" + "=" * 60)
print("Example 3: HuggingFace Dataset Annotation")
print("=" * 60)

from datasets import load_dataset

# Pull the Portuguese TTS dataset from the HuggingFace Hub.
dataset = load_dataset('marcosremar2/orpheus-tts-portuguese-dataset', split='train')

# Keep the demo fast: only the first 10 samples are annotated.
results = annotator.annotate_dataset(
    dataset,
    audio_column='audio',
    text_column='text',
    max_samples=10
)
print(f"\nAnnotated {len(results)} samples from dataset")

# Summarize the batch and report aggregate metrics.
stats = annotator.get_stats(results)
summary_lines = [
    f" Emotion distribution: {stats['emotion_distribution']}",
    f" Average confidence: {stats['avg_confidence']:.2%}",
    f" Average agreement: {stats['avg_agreement']:.2%}",
    f" Events detected: {stats['total_events_detected']}",
]
print("\nStatistics:")
for line in summary_lines:
    print(line)
# %% Example 4: Save results to file
print("\n" + "=" * 60)
print("Example 4: Save Results")
print("=" * 60)
# Convert to DataFrame
df = pd.DataFrame([
{
'sample_id': r.get('sample_id', i),
'text': r.get('text', ''),
'emotion': r['emotion']['label'],
'confidence': r['emotion']['confidence'],
'agreement': r['emotion']['agreement'],
'events': ','.join(r['events']['detected'])
}
for i, r in enumerate(results)
])
# Save to parquet
output_file = '../data/annotated/example_annotations.parquet'
df.to_parquet(output_file)
print(f"✅ Saved {len(df)} annotations to {output_file}")
# %% Example 5: Compare modes (Quick vs Balanced vs Full)
print("\n" + "=" * 60)
print("Example 5: Compare Modes")
print("=" * 60)

import time  # hoisted: the original re-imported `time` on every loop iteration

modes = ['quick', 'balanced', 'full']
test_audio = np.random.randn(16000 * 3)  # 3 seconds at 16 kHz

for mode in modes:
    print(f"\n{mode.upper()} MODE:")
    print("-" * 40)

    # Fresh annotator per mode; the timing below covers inference only,
    # not model construction.
    annotator_mode = EnsembleAnnotator(mode=mode, device='cpu')

    start = time.time()
    result = annotator_mode.annotate(test_audio, sample_rate=16000)
    elapsed = time.time() - start

    print(f" Emotion: {result['emotion']['label']}")
    print(f" Confidence: {result['emotion']['confidence']:.2%}")
    print(f" Agreement: {result['emotion']['agreement']:.2%}")
    print(f" Time: {elapsed:.2f}s")
# %% Example 6: Access individual model predictions
print("\n" + "=" * 60)
print("Example 6: Individual Model Predictions")
print("=" * 60)

result = annotator.annotate(dummy_audio, sample_rate=16000)
emotion = result['emotion']

# Final label after weighted voting across all models.
print("\nEnsemble decision:")
print(f" Final: {emotion['label']} ({emotion['confidence']:.2%})")

# Per-model labels, confidences and ensemble weights.
print("\nIndividual predictions:")
for pred in emotion['predictions']:
    print(f" {pred['model_name']}: {pred['label']} ({pred['confidence']:.2%}) [weight: {pred['model_weight']:.2f}]")

# Raw vote counts per emotion label.
print("\nVoting breakdown:")
for label, n_votes in emotion['votes'].items():
    print(f" {label}: {n_votes} vote(s)")
# %% Example 7: Use quick annotation function
print("\n" + "=" * 60)
print("Example 7: Quick Annotation Function")
print("=" * 60)

from ensemble_tts import annotate_file

# One-liner for real files:
# result = annotate_file('audio.wav', mode='balanced', device='cuda')

# Demo equivalent using the class API on the synthetic clip.
annotator_quick = EnsembleAnnotator(mode='quick', device='cpu')
result = annotator_quick.annotate(dummy_audio, sample_rate=16000)
print(f"Quick result: {result['emotion']['label']} ({result['emotion']['confidence']:.2%})")

print("\n" + "=" * 60)
print("✅ All examples completed!")
print("=" * 60)