|
|
""" |
|
|
Create synthetic audio samples for testing fine-tuning and annotation. |
|
|
|
|
|
This script generates synthetic audio samples with different characteristics |
|
|
to simulate emotional speech for testing purposes before real datasets are available. |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
import soundfile as sf |
|
|
from pathlib import Path |
|
|
import logging |
|
|
from typing import Dict, List |
|
|
import librosa |
|
|
|
|
|
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)

# Emit INFO-level progress messages when the root logger is not yet configured.
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
|
|
class SyntheticAudioGenerator:
    """Generate synthetic audio samples with emotion-like characteristics.

    Every ``generate_*`` method builds a sine tone plus harmonics, optionally
    modulates its amplitude or adds Gaussian noise, shapes the result with an
    ADSR envelope and peak-normalises it to a mono ``float32`` array.
    """

    def __init__(self, sample_rate: int = 16000):
        """Store the sample rate (in Hz) used for every generated signal.

        Raises:
            ValueError: If ``sample_rate`` is not positive.
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        self.sample_rate = sample_rate

    def _time_axis(self, n_samples: int) -> np.ndarray:
        """Return the timestamps (seconds) of ``n_samples`` consecutive samples."""
        # Spacing is exactly 1 / sample_rate. A linspace over [0, duration]
        # that includes the endpoint would stretch the spacing to
        # duration / (n - 1) and detune every sinusoid built from it.
        return np.arange(n_samples) / self.sample_rate

    @staticmethod
    def _normalize(signal: np.ndarray, peak: float) -> np.ndarray:
        """Scale ``signal`` so its largest absolute value is ``peak``.

        Empty and all-zero signals are returned as ``float32`` zeros instead of
        raising or producing NaN from a division by zero.
        """
        if signal.size == 0:
            return signal.astype(np.float32)
        max_abs = np.max(np.abs(signal))
        if max_abs == 0:
            return np.zeros(signal.shape, dtype=np.float32)
        return (signal / max_abs * peak).astype(np.float32)

    def generate_base_tone(self, duration: float, frequency: float) -> np.ndarray:
        """Generate a sine tone of ``frequency`` Hz lasting ``duration`` seconds.

        Returns:
            Array of ``int(duration * sample_rate)`` samples.

        Raises:
            ValueError: If ``duration`` is negative.
        """
        if duration < 0:
            raise ValueError(f"duration must be non-negative, got {duration}")
        t = self._time_axis(int(duration * self.sample_rate))
        return np.sin(2 * np.pi * frequency * t)

    def add_harmonics(self, tone: np.ndarray, frequencies: List[float],
                      amplitudes: List[float]) -> np.ndarray:
        """Add sinusoids at ``frequencies`` scaled by ``amplitudes`` to ``tone``.

        The input array is not modified; a new array is returned.

        Raises:
            ValueError: If ``frequencies`` and ``amplitudes`` differ in length.
        """
        if len(frequencies) != len(amplitudes):
            raise ValueError(
                "frequencies and amplitudes must have the same length "
                f"({len(frequencies)} != {len(amplitudes)})"
            )
        tone = np.asarray(tone)
        t = self._time_axis(len(tone))
        for freq, amp in zip(frequencies, amplitudes):
            tone = tone + amp * np.sin(2 * np.pi * freq * t)
        return tone

    def apply_envelope(self, audio: np.ndarray, attack: float = 0.1,
                       decay: float = 0.1, sustain: float = 0.7,
                       release: float = 0.2) -> np.ndarray:
        """Apply an ADSR envelope to ``audio``.

        Args:
            audio: Signal to shape.
            attack: Fraction of the signal spent ramping from 0 to 1.
            decay: Fraction spent ramping from 1 down to ``sustain``.
            sustain: Gain held between the decay and release segments.
            release: Fraction at the end spent ramping from ``sustain`` to 0.

        Returns:
            ``audio`` multiplied sample-wise by the envelope.

        Raises:
            ValueError: If ``attack``, ``decay`` or ``release`` is negative.
        """
        for label, fraction in (("attack", attack), ("decay", decay),
                                ("release", release)):
            if fraction < 0:
                raise ValueError(f"{label} must be non-negative, got {fraction}")

        audio = np.asarray(audio)
        n_samples = len(audio)
        envelope = np.ones(n_samples)

        # Clamp the segment boundaries so that fractions summing to more than
        # 1 shorten the later segments instead of overflowing the buffer.
        attack_end = min(int(attack * n_samples), n_samples)
        decay_end = min(attack_end + int(decay * n_samples), n_samples)
        release_start = max(n_samples - int(release * n_samples), decay_end)

        envelope[:attack_end] = np.linspace(0, 1, attack_end)
        envelope[attack_end:decay_end] = np.linspace(
            1, sustain, decay_end - attack_end)
        envelope[decay_end:release_start] = sustain
        envelope[release_start:] = np.linspace(
            sustain, 0, n_samples - release_start)

        return audio * envelope

    def _synthesize(self, duration: float, base_freq: float,
                    harmonic_multiples: List[float],
                    harmonic_amplitudes: List[float], *,
                    attack: float, decay: float, sustain: float,
                    release: float, peak: float,
                    modulation_depth: float = 0.0,
                    modulation_rate: float = 0.0,
                    noise_std: float = 0.0) -> np.ndarray:
        """Shared pipeline: tone -> harmonics -> modulation/noise -> ADSR -> peak."""
        tone = self.generate_base_tone(duration, base_freq)
        tone = self.add_harmonics(
            tone,
            [base_freq * multiple for multiple in harmonic_multiples],
            harmonic_amplitudes,
        )

        if modulation_depth:
            # Slow sinusoidal gain change; a negative depth starts by dipping.
            t = self._time_axis(len(tone))
            tone = tone * (
                1 + modulation_depth * np.sin(2 * np.pi * modulation_rate * t))

        if noise_std:
            # Uses NumPy's global RNG, so np.random.seed() controls the output.
            tone = tone + np.random.randn(len(tone)) * noise_std

        tone = self.apply_envelope(tone, attack=attack, decay=decay,
                                   sustain=sustain, release=release)
        return self._normalize(tone, peak)

    def generate_neutral(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate neutral emotion audio.
        Characteristics: Medium pitch, steady rhythm, minimal variation.

        150 Hz base with harmonics 2-4, no modulation, peak 0.7.
        """
        return self._synthesize(
            duration, 150.0, [2, 3, 4], [0.3, 0.15, 0.08],
            attack=0.1, decay=0.05, sustain=0.8, release=0.15, peak=0.7)

    def generate_happy(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate happy emotion audio.
        Characteristics: Higher pitch, faster rhythm, more energy.

        200 Hz base with harmonics 2-5, a 5 Hz amplitude wobble of 2 %,
        fast attack, peak 0.85.
        """
        return self._synthesize(
            duration, 200.0, [2, 3, 4, 5], [0.4, 0.25, 0.15, 0.1],
            attack=0.05, decay=0.05, sustain=0.9, release=0.1, peak=0.85,
            modulation_depth=0.02, modulation_rate=5)

    def generate_sad(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate sad emotion audio.
        Characteristics: Lower pitch, slower rhythm, less energy.

        100 Hz base with harmonics 2-3, a 3 Hz tremolo of 5 %, slow attack
        and release, peak 0.6.
        """
        return self._synthesize(
            duration, 100.0, [2, 3], [0.25, 0.12],
            attack=0.15, decay=0.1, sustain=0.6, release=0.25, peak=0.6,
            modulation_depth=-0.05, modulation_rate=3)

    def generate_angry(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate angry emotion audio.
        Characteristics: Variable pitch, harsh harmonics, high energy.

        180 Hz base with strong harmonics 2, 3, 4 and 6, Gaussian noise
        (sigma 0.1), very sharp attack, peak 0.9.
        """
        return self._synthesize(
            duration, 180.0, [2, 3, 4, 6], [0.5, 0.3, 0.2, 0.15],
            attack=0.02, decay=0.05, sustain=0.95, release=0.08, peak=0.9,
            noise_std=0.1)

    def generate_fearful(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate fearful emotion audio.
        Characteristics: Variable pitch, trembling, high frequency.

        220 Hz base with harmonics 2, 3 and 5, an 8 Hz tremble of 8 %,
        peak 0.75.
        """
        return self._synthesize(
            duration, 220.0, [2, 3, 5], [0.35, 0.2, 0.15],
            attack=0.08, decay=0.12, sustain=0.7, release=0.15, peak=0.75,
            modulation_depth=-0.08, modulation_rate=8)

    def generate_disgusted(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate disgusted emotion audio.
        Characteristics: Lower pitch, nasal quality, reduced energy.

        130 Hz base with odd harmonics 3, 5 and 7, Gaussian noise
        (sigma 0.05), peak 0.65.
        """
        return self._synthesize(
            duration, 130.0, [3, 5, 7], [0.4, 0.25, 0.15],
            attack=0.12, decay=0.1, sustain=0.65, release=0.2, peak=0.65,
            noise_std=0.05)

    def generate_surprised(self, duration: float = 3.0) -> np.ndarray:
        """
        Generate surprised emotion audio.
        Characteristics: Sudden onset, high pitch, short duration tendency.

        250 Hz base with harmonics 2-4, near-instant attack followed by a
        quick decay, peak 0.8.
        """
        return self._synthesize(
            duration, 250.0, [2, 3, 4], [0.45, 0.3, 0.2],
            attack=0.01, decay=0.15, sustain=0.8, release=0.12, peak=0.8)
|
|
|
|
|
|
|
|
def create_test_dataset(output_dir: Path, samples_per_emotion: int = 10,
                        sample_rate: int = 16000):
    """
    Create a synthetic test dataset with multiple samples per emotion.

    One sub-directory per emotion is created under ``output_dir``; each holds
    ``samples_per_emotion`` WAV files of random length between 2.5 and 3.5
    seconds. A ``metadata.json`` describing the dataset is written alongside.

    Args:
        output_dir: Directory to save audio files
        samples_per_emotion: Number of samples to generate per emotion
        sample_rate: Sample rate in Hz used for synthesis, for the WAV files
            and in the metadata (default: 16000)

    Returns:
        The output directory as a ``Path``.

    Raises:
        ValueError: If ``samples_per_emotion`` is negative.
    """
    import json

    if samples_per_emotion < 0:
        raise ValueError(
            f"samples_per_emotion must be non-negative, got {samples_per_emotion}")

    output_dir = Path(output_dir)

    logger.info("🎵 Creating synthetic test dataset...")
    logger.info("Output: %s", output_dir)
    logger.info("Samples per emotion: %s", samples_per_emotion)

    output_dir.mkdir(parents=True, exist_ok=True)

    # A single sample rate drives synthesis, the WAV headers and the metadata
    # so the three can never disagree.
    generator = SyntheticAudioGenerator(sample_rate=sample_rate)

    emotions = {
        "neutral": generator.generate_neutral,
        "happy": generator.generate_happy,
        "sad": generator.generate_sad,
        "angry": generator.generate_angry,
        "fearful": generator.generate_fearful,
        "disgusted": generator.generate_disgusted,
        "surprised": generator.generate_surprised,
    }

    total_files = 0

    for emotion, generate_fn in emotions.items():
        emotion_dir = output_dir / emotion
        emotion_dir.mkdir(exist_ok=True)

        logger.info("\n Generating %s...", emotion)

        for i in range(samples_per_emotion):
            # Vary the clip length uniformly in [2.5, 3.5) seconds.
            duration = 2.5 + np.random.rand() * 1.0
            audio = generate_fn(duration)

            filename = emotion_dir / f"{emotion}_{i:03d}.wav"
            sf.write(filename, audio, sample_rate)
            total_files += 1

        logger.info(" ✓ %s files created", samples_per_emotion)

    logger.info("\n✅ Total: %s synthetic audio files created", total_files)
    logger.info("📁 Location: %s", output_dir)

    metadata = {
        "dataset_name": "synthetic_emotions_test",
        "total_samples": total_files,
        "samples_per_emotion": samples_per_emotion,
        "emotions": list(emotions),
        "sample_rate": sample_rate,
        "description": "Synthetic audio samples for testing emotion recognition",
    }

    metadata_path = output_dir / "metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    logger.info("📄 Metadata saved to: %s", metadata_path)

    return output_dir
|
|
|
|
|
|
|
|
def main(argv=None):
    """Command-line entry point: build the dataset and print follow-up steps.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default) ``argparse`` reads ``sys.argv[1:]``, so calling
            ``main()`` behaves exactly as before.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Create synthetic test audio data")
    parser.add_argument("--output", type=str, default="data/raw/synthetic/",
                        help="Output directory")
    parser.add_argument("--samples", type=int, default=10,
                        help="Samples per emotion (default: 10)")

    args = parser.parse_args(argv)

    output_dir = Path(args.output)
    create_test_dataset(output_dir, args.samples)

    separator = "=" * 60
    logger.info("\n" + separator)
    logger.info("Next steps:")
    logger.info(separator)
    logger.info("\n1. Prepare dataset for training:")
    logger.info("\n python scripts/data/download_ptbr_datasets.py \\")
    logger.info(" --prepare-local %s", output_dir)
    logger.info("\n2. Fine-tune with synthetic data:")
    logger.info("\n python scripts/training/finetune_emotion2vec.py \\")
    logger.info(" --dataset data/prepared/synthetic_prepared \\")
    logger.info(" --epochs 5 \\")
    logger.info(" --device cpu")
    logger.info("\n💡 Note: This is synthetic data for testing only.")
    logger.info(" Use real datasets (VERBO, emoUERJ) for production fine-tuning.")
|
|
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|