# File size: 12,129 Bytes
#
# 13e402e

"""
Create synthetic audio samples for testing fine-tuning and annotation.

This script generates synthetic audio samples with different characteristics
to simulate emotional speech for testing purposes before real datasets are available.
"""

import numpy as np
import soundfile as sf
from pathlib import Path
import logging
from typing import Dict, List
import librosa

# Configure root logging at import time so INFO-level progress messages are
# emitted, and create the module-level logger used throughout this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SyntheticAudioGenerator:
    """Generate synthetic audio samples with emotion-like characteristics.

    Each ``generate_*`` method builds a harmonic tone, optionally adds
    amplitude modulation or noise, shapes it with an ADSR envelope and
    peak-normalises the result. All of them return a mono ``float32`` array
    of ``int(duration * sample_rate)`` samples.
    """

    def __init__(self, sample_rate: int = 16000):
        """Store the sample rate (in Hz) used for every generated signal."""
        self.sample_rate = sample_rate

    def _time_axis(self, n_samples: int) -> np.ndarray:
        """Return the timestamps (in seconds) of ``n_samples`` samples."""
        # Dividing sample indices by the rate keeps the spacing at exactly
        # 1 / sample_rate. np.linspace(0, duration, n) includes the endpoint,
        # which stretches the spacing to duration / (n - 1) and detunes tones.
        return np.arange(n_samples) / self.sample_rate

    @staticmethod
    def _normalize(audio: np.ndarray, peak: float) -> np.ndarray:
        """Scale ``audio`` so its largest absolute sample equals ``peak``.

        Empty or all-zero input is returned unchanged, because there is no
        finite scale factor for it.
        """
        if audio.size == 0:
            return audio
        max_abs = np.max(np.abs(audio))
        if max_abs == 0:
            return audio
        return audio / max_abs * peak

    def _amplitude_modulate(self, tone: np.ndarray, depth: float,
                            rate_hz: float) -> np.ndarray:
        """Multiply ``tone`` by ``1 + depth * sin(2*pi*rate_hz*t)``."""
        t = self._time_axis(len(tone))
        return tone * (1 + depth * np.sin(2 * np.pi * rate_hz * t))

    def generate_base_tone(self, duration: float, frequency: float) -> np.ndarray:
        """Generate a sine tone.

        Args:
            duration: Length of the tone in seconds (must be >= 0).
            frequency: Tone frequency in Hz.

        Returns:
            Array of ``int(duration * sample_rate)`` samples in [-1, 1].

        Raises:
            ValueError: If ``duration`` is negative.
        """
        if duration < 0:
            raise ValueError("duration must be non-negative")
        n_samples = int(duration * self.sample_rate)
        return np.sin(2 * np.pi * frequency * self._time_axis(n_samples))

    def add_harmonics(self, tone: np.ndarray, frequencies: List[float],
                     amplitudes: List[float]) -> np.ndarray:
        """Add sine components to ``tone`` to simulate voice complexity.

        Args:
            tone: Signal to enrich; it is not modified in place.
            frequencies: Frequencies (Hz) of the components to add.
            amplitudes: Amplitude of each component, paired with
                ``frequencies`` by position (unpaired extras are ignored).

        Returns:
            A new array holding ``tone`` plus all added components.
        """
        # Same time axis as generate_base_tone, so the added components stay
        # phase-aligned with the fundamental.
        t = self._time_axis(len(tone))

        for freq, amp in zip(frequencies, amplitudes):
            tone = tone + amp * np.sin(2 * np.pi * freq * t)

        return tone

    def apply_envelope(self, audio: np.ndarray, attack: float = 0.1,
                      decay: float = 0.1, sustain: float = 0.7,
                      release: float = 0.2) -> np.ndarray:
        """Apply an ADSR envelope to ``audio``.

        Args:
            audio: Signal to shape.
            attack: Fraction of the signal spent rising from 0 to 1.
            decay: Fraction spent falling from 1 to ``sustain``.
            sustain: Gain held between the decay and release phases.
            release: Fraction at the end spent falling from ``sustain`` to 0.

        Returns:
            ``audio`` multiplied sample-wise by the envelope.

        Raises:
            ValueError: If ``attack``, ``decay`` or ``release`` is negative.
        """
        if min(attack, decay, release) < 0:
            raise ValueError("attack, decay and release must be non-negative")

        n_samples = len(audio)
        envelope = np.ones(n_samples)

        # Clamp every boundary to the signal length so that fractions adding
        # up to more than 1 shorten the later phases instead of producing
        # mismatched slice lengths or overwriting earlier phases.
        attack_end = min(int(attack * n_samples), n_samples)
        decay_end = min(attack_end + int(decay * n_samples), n_samples)
        release_start = max(n_samples - int(release * n_samples), decay_end)

        envelope[:attack_end] = np.linspace(0, 1, attack_end)
        envelope[attack_end:decay_end] = np.linspace(
            1, sustain, decay_end - attack_end)
        envelope[decay_end:release_start] = sustain
        envelope[release_start:] = np.linspace(
            sustain, 0, n_samples - release_start)

        return audio * envelope

    def generate_neutral(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate neutral emotion audio.
        Characteristics: Medium pitch, steady rhythm, minimal variation.
        """
        # Base frequency: medium pitch (male: ~120Hz, female: ~220Hz)
        base_freq = 150.0
        tone = self.generate_base_tone(duration, base_freq)

        # Add subtle harmonics
        harmonics = [base_freq * 2, base_freq * 3, base_freq * 4]
        amplitudes = [0.3, 0.15, 0.08]
        tone = self.add_harmonics(tone, harmonics, amplitudes)

        # Steady envelope
        tone = self.apply_envelope(tone, attack=0.1, decay=0.05,
                                  sustain=0.8, release=0.15)

        # Normalize to a moderate peak level
        tone = self._normalize(tone, 0.7)

        return tone.astype(np.float32)

    def generate_happy(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate happy emotion audio.
        Characteristics: Higher pitch, faster rhythm, more energy.
        """
        # Higher pitch
        base_freq = 200.0
        tone = self.generate_base_tone(duration, base_freq)

        # More pronounced harmonics
        harmonics = [base_freq * 2, base_freq * 3, base_freq * 4, base_freq * 5]
        amplitudes = [0.4, 0.25, 0.15, 0.1]
        tone = self.add_harmonics(tone, harmonics, amplitudes)

        # Gentle 5 Hz wobble. Note this scales the amplitude (+/-2%); it does
        # not modulate the pitch.
        tone = self._amplitude_modulate(tone, depth=0.02, rate_hz=5)

        # Energetic envelope
        tone = self.apply_envelope(tone, attack=0.05, decay=0.05,
                                  sustain=0.9, release=0.1)

        # Higher energy
        tone = self._normalize(tone, 0.85)

        return tone.astype(np.float32)

    def generate_sad(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate sad emotion audio.
        Characteristics: Lower pitch, slower rhythm, less energy.
        """
        # Lower pitch
        base_freq = 100.0
        tone = self.generate_base_tone(duration, base_freq)

        # Fewer harmonics (less bright)
        harmonics = [base_freq * 2, base_freq * 3]
        amplitudes = [0.25, 0.12]
        tone = self.add_harmonics(tone, harmonics, amplitudes)

        # Add tremolo (3 Hz amplitude modulation, inverted phase)
        tone = self._amplitude_modulate(tone, depth=-0.05, rate_hz=3)

        # Slower envelope
        tone = self.apply_envelope(tone, attack=0.15, decay=0.1,
                                  sustain=0.6, release=0.25)

        # Lower energy
        tone = self._normalize(tone, 0.6)

        return tone.astype(np.float32)

    def generate_angry(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate angry emotion audio.
        Characteristics: Harsh harmonics, added noise, high energy.

        The noise comes from NumPy's global random state, so the output
        differs between calls unless that state is seeded.
        """
        # Medium-high pitch
        base_freq = 180.0
        tone = self.generate_base_tone(duration, base_freq)

        # Harsh harmonics
        harmonics = [base_freq * 2, base_freq * 3, base_freq * 4, base_freq * 6]
        amplitudes = [0.5, 0.3, 0.2, 0.15]
        tone = self.add_harmonics(tone, harmonics, amplitudes)

        # Add roughness (Gaussian noise)
        tone = tone + np.random.randn(len(tone)) * 0.1

        # Aggressive envelope
        tone = self.apply_envelope(tone, attack=0.02, decay=0.05,
                                  sustain=0.95, release=0.08)

        # High energy
        tone = self._normalize(tone, 0.9)

        return tone.astype(np.float32)

    def generate_fearful(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate fearful emotion audio.
        Characteristics: High pitch with fast trembling.
        """
        # Higher pitch
        base_freq = 220.0
        tone = self.generate_base_tone(duration, base_freq)

        # Irregularly spaced harmonics (2nd, 3rd, 5th)
        harmonics = [base_freq * 2, base_freq * 3, base_freq * 5]
        amplitudes = [0.35, 0.2, 0.15]
        tone = self.add_harmonics(tone, harmonics, amplitudes)

        # Add trembling (fast 8 Hz amplitude modulation, inverted phase)
        tone = self._amplitude_modulate(tone, depth=-0.08, rate_hz=8)

        # Envelope with a comparatively long decay
        tone = self.apply_envelope(tone, attack=0.08, decay=0.12,
                                  sustain=0.7, release=0.15)

        tone = self._normalize(tone, 0.75)

        return tone.astype(np.float32)

    def generate_disgusted(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate disgusted emotion audio.
        Characteristics: Lower pitch, nasal quality, reduced energy.

        The noise comes from NumPy's global random state, so the output
        differs between calls unless that state is seeded.
        """
        # Lower-medium pitch
        base_freq = 130.0
        tone = self.generate_base_tone(duration, base_freq)

        # Nasal harmonics (odd harmonics emphasized)
        harmonics = [base_freq * 3, base_freq * 5, base_freq * 7]
        amplitudes = [0.4, 0.25, 0.15]
        tone = self.add_harmonics(tone, harmonics, amplitudes)

        # Add slight roughness (Gaussian noise)
        tone = tone + np.random.randn(len(tone)) * 0.05

        # Reduced energy envelope
        tone = self.apply_envelope(tone, attack=0.12, decay=0.1,
                                  sustain=0.65, release=0.2)

        tone = self._normalize(tone, 0.65)

        return tone.astype(np.float32)

    def generate_surprised(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate surprised emotion audio.
        Characteristics: Sudden onset, high pitch.
        """
        # High pitch
        base_freq = 250.0
        tone = self.generate_base_tone(duration, base_freq)

        # Bright harmonics
        harmonics = [base_freq * 2, base_freq * 3, base_freq * 4]
        amplitudes = [0.45, 0.3, 0.2]
        tone = self.add_harmonics(tone, harmonics, amplitudes)

        # Very fast attack envelope
        tone = self.apply_envelope(tone, attack=0.01, decay=0.15,
                                  sustain=0.8, release=0.12)

        tone = self._normalize(tone, 0.8)

        return tone.astype(np.float32)


def create_test_dataset(output_dir: Path, samples_per_emotion: int = 10,
                        sample_rate: int = 16000) -> Path:
    """
    Create a synthetic test dataset with multiple samples per emotion.

    One sub-directory per emotion is created under ``output_dir``, each
    holding ``samples_per_emotion`` WAV files with a random duration between
    2.5 and 3.5 seconds. A ``metadata.json`` describing the dataset is
    written next to them.

    Args:
        output_dir: Directory to save audio files (created if missing)
        samples_per_emotion: Number of samples to generate per emotion
        sample_rate: Sample rate in Hz used for synthesis, for the WAV
            files and in the metadata

    Returns:
        The dataset directory as a ``Path``.
    """
    import json

    # Accept plain strings as well as Path objects.
    output_dir = Path(output_dir)

    logger.info("🎵 Creating synthetic test dataset...")
    logger.info(f"Output: {output_dir}")
    logger.info(f"Samples per emotion: {samples_per_emotion}")

    output_dir.mkdir(parents=True, exist_ok=True)

    # A single sample_rate value feeds the generator, the WAV writer and the
    # metadata so the three can never disagree.
    generator = SyntheticAudioGenerator(sample_rate=sample_rate)

    emotions = {
        "neutral": generator.generate_neutral,
        "happy": generator.generate_happy,
        "sad": generator.generate_sad,
        "angry": generator.generate_angry,
        "fearful": generator.generate_fearful,
        "disgusted": generator.generate_disgusted,
        "surprised": generator.generate_surprised,
    }

    total_files = 0

    for emotion, generate_fn in emotions.items():
        emotion_dir = output_dir / emotion
        emotion_dir.mkdir(exist_ok=True)

        logger.info(f"\n  Generating {emotion}...")

        for i in range(samples_per_emotion):
            # Vary duration slightly
            duration = 2.5 + np.random.rand() * 1.0  # 2.5 to 3.5 seconds

            audio = generate_fn(duration)

            filename = emotion_dir / f"{emotion}_{i:03d}.wav"
            # str() keeps this working with soundfile releases that only
            # accept string paths.
            sf.write(str(filename), audio, sample_rate)
            total_files += 1

        logger.info(f"    ✓ {samples_per_emotion} files created")

    logger.info(f"\n✅ Total: {total_files} synthetic audio files created")
    logger.info(f"📁 Location: {output_dir}")

    # Create metadata file
    metadata = {
        "dataset_name": "synthetic_emotions_test",
        "total_samples": total_files,
        "samples_per_emotion": samples_per_emotion,
        "emotions": list(emotions),
        "sample_rate": sample_rate,
        "description": "Synthetic audio samples for testing emotion recognition",
    }

    metadata_path = output_dir / "metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    logger.info(f"📄 Metadata saved to: {metadata_path}")

    return output_dir


def main():
    """Parse command-line options, build the dataset and log follow-up steps."""
    import argparse

    cli = argparse.ArgumentParser(description="Create synthetic test audio data")
    cli.add_argument(
        "--output",
        type=str,
        default="data/raw/synthetic/",
        help="Output directory",
    )
    cli.add_argument(
        "--samples",
        type=int,
        default=10,
        help="Samples per emotion (default: 10)",
    )
    options = cli.parse_args()

    output_dir = Path(options.output)
    create_test_dataset(output_dir, options.samples)

    # Each entry is emitted as its own log record, in order.
    rule = "=" * 60
    follow_up_lines = (
        "\n" + rule,
        "Next steps:",
        rule,
        "\n1. Prepare dataset for training:",
        "\n   python scripts/data/download_ptbr_datasets.py \\",
        f"       --prepare-local {output_dir}",
        "\n2. Fine-tune with synthetic data:",
        "\n   python scripts/training/finetune_emotion2vec.py \\",
        "       --dataset data/prepared/synthetic_prepared \\",
        "       --epochs 5 \\",
        "       --device cpu",
        "\n💡 Note: This is synthetic data for testing only.",
        "   Use real datasets (VERBO, emoUERJ) for production fine-tuning.",
    )
    for line in follow_up_lines:
        logger.info(line)

# Run the command-line entry point only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()