File size: 5,310 Bytes

fe63a26

"""
Quickstart Example - Ensemble TTS Annotation

This notebook demonstrates how to use the EnsembleAnnotator for Portuguese BR
TTS annotation with OPTION A (3-model ensemble).
"""

# %% Install dependencies (if needed)
# !pip install -r ../requirements.txt

# %% Imports
from ensemble_tts import EnsembleAnnotator
import librosa
import pandas as pd
from pathlib import Path

# %% Example 1: Annotate a single audio file

print("=" * 60, "Example 1: Single Audio File Annotation", "=" * 60, sep="\n")

# Settings for the shared annotator (OPTION A - balanced mode).
annotator_config = {
    'mode': 'balanced',             # 3 models: emotion2vec (fine-tuned) + Whisper + SenseVoice
    'device': 'cpu',                # switch to 'cuda' if a GPU is available
    'voting_strategy': 'weighted',
    'enable_events': True,
}
# This instance is reused by the later cells in this script.
annotator = EnsembleAnnotator(**annotator_config)

# To annotate a real recording, pass its path instead:
# result = annotator.annotate('path/to/audio.wav')

# Stand-in input: 16000 random samples, i.e. one second at the 16000 Hz
# sample rate passed to annotate() below.
import numpy as np
dummy_audio = np.random.randn(16000)
result = annotator.annotate(dummy_audio, sample_rate=16000)

print("\nResult:")
emotion_summary = result['emotion']
print(f"Emotion: {emotion_summary['label']}")
print(f"Confidence: {emotion_summary['confidence']:.2%}")
print(f"Agreement: {emotion_summary['agreement']:.2%}")
print(f"Events detected: {result['events']['detected']}")

# %% Example 2: Annotate multiple files

print("\n" + "=" * 60, "Example 2: Batch Annotation", "=" * 60, sep="\n")

# Paths you would supply for real data. They are not used below; synthetic
# clips stand in for them in this example.
audio_files = ['audio1.wav', 'audio2.wav', 'audio3.wav']

# Three synthetic clips of 16000, 32000 and 48000 random samples.
dummy_audios = [np.random.randn(16000 * seconds) for seconds in (1, 2, 3)]

# Annotate every clip in one call, passing one sample rate per clip.
results = annotator.annotate_batch(
    dummy_audios,
    sample_rates=[16000] * len(dummy_audios),
)

print(f"\nAnnotated {len(results)} files")
for file_number, result in enumerate(results, start=1):
    file_emotion = result['emotion']
    print(f"  File {file_number}: {file_emotion['label']} ({file_emotion['confidence']:.2%})")

# %% Example 3: Annotate HuggingFace dataset

print("\n" + "=" * 60, "Example 3: HuggingFace Dataset Annotation", "=" * 60, sep="\n")

from datasets import load_dataset

# Load the 'train' split of the dataset.
dataset = load_dataset(
    'marcosremar2/orpheus-tts-portuguese-dataset',
    split='train',
)

# Cap the run at 10 samples and name the columns that hold audio and text.
dataset_options = {
    'audio_column': 'audio',
    'text_column': 'text',
    'max_samples': 10,
}
results = annotator.annotate_dataset(dataset, **dataset_options)

print(f"\nAnnotated {len(results)} samples from dataset")

# Aggregate the per-sample annotations into summary statistics.
stats = annotator.get_stats(results)
print("\nStatistics:")
emotion_distribution = stats['emotion_distribution']
print(f"  Emotion distribution: {emotion_distribution}")
avg_confidence = stats['avg_confidence']
print(f"  Average confidence: {avg_confidence:.2%}")
avg_agreement = stats['avg_agreement']
print(f"  Average agreement: {avg_agreement:.2%}")
total_events = stats['total_events_detected']
print(f"  Events detected: {total_events}")

# %% Example 4: Save results to file

print("\n" + "=" * 60)
print("Example 4: Save Results")
print("=" * 60)

# Flatten each annotation into one row. 'sample_id' falls back to the
# position in `results` and 'text' to an empty string when the annotation
# does not carry them.
df = pd.DataFrame([
    {
        'sample_id': r.get('sample_id', i),
        'text': r.get('text', ''),
        'emotion': r['emotion']['label'],
        'confidence': r['emotion']['confidence'],
        'agreement': r['emotion']['agreement'],
        'events': ','.join(r['events']['detected'])
    }
    for i, r in enumerate(results)
])

# Save to parquet
output_file = '../data/annotated/example_annotations.parquet'
# to_parquet() does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails on the write.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_file)
print(f"✅ Saved {len(df)} annotations to {output_file}")

# %% Example 5: Compare modes (Quick vs Balanced vs Full)

# Imported once for the cell rather than on every loop iteration.
import time

print("\n" + "=" * 60)
print("Example 5: Compare Modes")
print("=" * 60)

modes = ['quick', 'balanced', 'full']
test_audio = np.random.randn(16000 * 3)  # 3 seconds

for mode in modes:
    print(f"\n{mode.upper()} MODE:")
    print("-" * 40)

    # Construct the annotator before starting the clock so that only the
    # annotate() call is measured.
    # NOTE(review): if models are loaded lazily on the first annotate()
    # call, that load time is still included in the figure - confirm.
    annotator_mode = EnsembleAnnotator(mode=mode, device='cpu')

    # perf_counter() is monotonic and high-resolution, so the elapsed time
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    result = annotator_mode.annotate(test_audio, sample_rate=16000)
    elapsed = time.perf_counter() - start

    print(f"  Emotion: {result['emotion']['label']}")
    print(f"  Confidence: {result['emotion']['confidence']:.2%}")
    print(f"  Agreement: {result['emotion']['agreement']:.2%}")
    print(f"  Time: {elapsed:.2f}s")

# %% Example 6: Access individual model predictions

print("\n" + "=" * 60, "Example 6: Individual Model Predictions", "=" * 60, sep="\n")

# Re-annotate the one-second dummy clip with the shared annotator.
result = annotator.annotate(dummy_audio, sample_rate=16000)

print("\nEnsemble decision:")
ensemble = result['emotion']
print(f"  Final: {ensemble['label']} ({ensemble['confidence']:.2%})")

# One line per entry in the result's 'predictions' list.
print("\nIndividual predictions:")
for member in ensemble['predictions']:
    verdict = f"  {member['model_name']}: {member['label']} ({member['confidence']:.2%})"
    print(f"{verdict} [weight: {member['model_weight']:.2f}]")

# One line per entry in the result's 'votes' mapping.
print("\nVoting breakdown:")
for voted_label, n_votes in ensemble['votes'].items():
    print(f"  {voted_label}: {n_votes} vote(s)")

# %% Example 7: Use quick annotation function

print("\n" + "=" * 60, "Example 7: Quick Annotation Function", "=" * 60, sep="\n")

from ensemble_tts import annotate_file

# One-liner form for a real file on disk:
# result = annotate_file('audio.wav', mode='balanced', device='cuda')

# Demo path: no audio file is available here, so run a 'quick'-mode
# annotator on the dummy clip instead.
annotator_quick = EnsembleAnnotator(mode='quick', device='cpu')
result = annotator_quick.annotate(dummy_audio, sample_rate=16000)

quick_emotion = result['emotion']
print(f"Quick result: {quick_emotion['label']} ({quick_emotion['confidence']:.2%})")

print("\n" + "=" * 60, "✅ All examples completed!", "=" * 60, sep="\n")