File size: 12,129 Bytes
13e402e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
"""
Create synthetic audio samples for testing fine-tuning and annotation.

This script generates synthetic audio samples with different characteristics
to simulate emotional speech for testing purposes before real datasets are available.
"""

import numpy as np
import soundfile as sf
from pathlib import Path
import logging
from typing import Dict, List
import librosa

# Configure the root logger when this module is loaded so that INFO-level
# progress messages emitted by this script are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions in this module.
logger = logging.getLogger(__name__)


class SyntheticAudioGenerator:
    """Generate synthetic audio samples with emotion-like characteristics.

    Every ``generate_*`` method builds a sine tone at an emotion-specific
    fundamental, layers harmonics on top, optionally applies a slow amplitude
    modulation or additive Gaussian noise, shapes the result with an
    ADSR-style envelope and finally scales the peak to an emotion-specific
    level.  Each returns a 1-D ``float32`` array holding
    ``int(duration * sample_rate)`` samples.
    """

    def __init__(self, sample_rate: int = 16000):
        """Store the sampling rate (in Hz) used for every generated signal."""
        self.sample_rate = sample_rate

    def _time_axis(self, n_samples: int) -> np.ndarray:
        """Return ``n_samples`` time stamps (seconds) spaced 1/sample_rate apart.

        A single shared time grid keeps the fundamental, its harmonics and any
        modulation phase-aligned.  ``np.arange`` is used instead of an
        endpoint-inclusive ``np.linspace`` so the sample period is exactly
        ``1 / sample_rate`` and generated frequencies are not skewed.
        """
        return np.arange(n_samples) / self.sample_rate

    def generate_base_tone(self, duration: float, frequency: float) -> np.ndarray:
        """Generate a unit-amplitude sine tone.

        Args:
            duration: Length of the tone in seconds (must be non-negative).
            frequency: Tone frequency in Hz.

        Returns:
            Array of ``int(duration * sample_rate)`` samples.

        Raises:
            ValueError: If ``duration`` is negative.
        """
        if duration < 0:
            raise ValueError("duration must be non-negative")
        t = self._time_axis(int(duration * self.sample_rate))
        return np.sin(2 * np.pi * frequency * t)

    def add_harmonics(self, tone: np.ndarray, frequencies: List[float],
                      amplitudes: List[float]) -> np.ndarray:
        """Add sine components to ``tone`` to simulate voice complexity.

        Args:
            tone: Signal to enrich; it is not modified in place.
            frequencies: Frequency (Hz) of each added component.
            amplitudes: Amplitude of each added component.  Components are
                paired with ``frequencies`` positionally; surplus entries in
                the longer list are ignored.

        Returns:
            A new array with all components summed onto ``tone``.
        """
        t = self._time_axis(len(tone))
        enriched = tone
        for freq, amp in zip(frequencies, amplitudes):
            enriched = enriched + amp * np.sin(2 * np.pi * freq * t)
        return enriched

    def apply_envelope(self, audio: np.ndarray, attack: float = 0.1,
                       decay: float = 0.1, sustain: float = 0.7,
                       release: float = 0.2) -> np.ndarray:
        """Apply an ADSR envelope to ``audio``.

        Args:
            audio: Signal to shape.
            attack: Fraction of the signal spent ramping from 0 to 1.
            decay: Fraction spent ramping from 1 down to ``sustain``.
            sustain: Gain held between the decay and release phases.
            release: Fraction at the end spent ramping from ``sustain`` to 0.

        Returns:
            ``audio`` multiplied sample-wise by the envelope.
        """
        n_samples = len(audio)
        envelope = np.ones(n_samples)

        # Phase lengths are clamped to the room that is actually left so that
        # over-long fractions shorten a phase instead of crashing with a
        # shape mismatch on assignment.
        attack_samples = min(int(attack * n_samples), n_samples)
        envelope[:attack_samples] = np.linspace(0, 1, attack_samples)

        decay_samples = min(int(decay * n_samples), n_samples - attack_samples)
        decay_end = attack_samples + decay_samples
        envelope[attack_samples:decay_end] = np.linspace(1, sustain, decay_samples)

        release_samples = min(int(release * n_samples), n_samples)
        sustain_end = n_samples - release_samples
        # Empty slice (no-op) when the release phase starts before decay ends.
        envelope[decay_end:sustain_end] = sustain

        envelope[sustain_end:] = np.linspace(sustain, 0, release_samples)

        return audio * envelope

    @staticmethod
    def _normalize(audio: np.ndarray, peak: float) -> np.ndarray:
        """Scale ``audio`` so its largest absolute sample equals ``peak``.

        Empty or all-zero signals are returned as (empty / zero) ``float32``
        arrays instead of raising or dividing by zero.
        """
        if audio.size == 0:
            return audio.astype(np.float32)
        max_abs = np.max(np.abs(audio))
        if max_abs == 0:
            return np.zeros(audio.shape, dtype=np.float32)
        return (audio / max_abs * peak).astype(np.float32)

    def _synthesize(self, duration: float, base_freq: float,
                    harmonic_multiples: List[float], amplitudes: List[float],
                    *, attack: float, decay: float, sustain: float,
                    release: float, peak: float, mod_depth: float = 0.0,
                    mod_rate: float = 0.0, noise_level: float = 0.0) -> np.ndarray:
        """Run the shared pipeline behind every ``generate_*`` preset.

        Steps, in order: base tone -> harmonics -> optional amplitude
        modulation ``1 + mod_depth * sin(2*pi*mod_rate*t)`` -> optional
        additive Gaussian noise scaled by ``noise_level`` -> ADSR envelope ->
        peak normalisation to ``peak`` and conversion to ``float32``.
        """
        tone = self.generate_base_tone(duration, base_freq)
        tone = self.add_harmonics(
            tone, [base_freq * m for m in harmonic_multiples], amplitudes)

        if mod_depth:
            t = self._time_axis(len(tone))
            tone = tone * (1 + mod_depth * np.sin(2 * np.pi * mod_rate * t))

        if noise_level:
            # Uses NumPy's global RNG, so np.random.seed() makes it repeatable.
            tone = tone + np.random.randn(len(tone)) * noise_level

        tone = self.apply_envelope(tone, attack=attack, decay=decay,
                                   sustain=sustain, release=release)
        return self._normalize(tone, peak)

    def generate_neutral(self, duration: float = 3.0) -> np.ndarray:
        """Generate neutral emotion audio.

        150 Hz fundamental with subtle 2nd-4th harmonics, a steady envelope
        and a peak level of 0.7.
        """
        return self._synthesize(
            duration, 150.0, [2, 3, 4], [0.3, 0.15, 0.08],
            attack=0.1, decay=0.05, sustain=0.8, release=0.15, peak=0.7)

    def generate_happy(self, duration: float = 3.0) -> np.ndarray:
        """Generate happy emotion audio.

        200 Hz fundamental with pronounced 2nd-5th harmonics, a light 5 Hz
        amplitude modulation (2 % depth; note this varies loudness, not
        pitch), a fast envelope and a peak level of 0.85.
        """
        return self._synthesize(
            duration, 200.0, [2, 3, 4, 5], [0.4, 0.25, 0.15, 0.1],
            attack=0.05, decay=0.05, sustain=0.9, release=0.1, peak=0.85,
            mod_depth=0.02, mod_rate=5)

    def generate_sad(self, duration: float = 3.0) -> np.ndarray:
        """Generate sad emotion audio.

        100 Hz fundamental with only 2nd and 3rd harmonics, a 3 Hz tremolo
        (5 % depth, inverted phase), a slow envelope and a peak level of 0.6.
        """
        return self._synthesize(
            duration, 100.0, [2, 3], [0.25, 0.12],
            attack=0.15, decay=0.1, sustain=0.6, release=0.25, peak=0.6,
            mod_depth=-0.05, mod_rate=3)

    def generate_angry(self, duration: float = 3.0) -> np.ndarray:
        """Generate angry emotion audio.

        180 Hz fundamental with strong 2nd, 3rd, 4th and 6th harmonics,
        additive Gaussian noise (scale 0.1) for roughness, a very fast attack
        and a peak level of 0.9.
        """
        return self._synthesize(
            duration, 180.0, [2, 3, 4, 6], [0.5, 0.3, 0.2, 0.15],
            attack=0.02, decay=0.05, sustain=0.95, release=0.08, peak=0.9,
            noise_level=0.1)

    def generate_fearful(self, duration: float = 3.0) -> np.ndarray:
        """Generate fearful emotion audio.

        220 Hz fundamental with 2nd, 3rd and 5th harmonics, an 8 Hz trembling
        amplitude modulation (8 % depth, inverted phase) and a peak level of
        0.75.
        """
        return self._synthesize(
            duration, 220.0, [2, 3, 5], [0.35, 0.2, 0.15],
            attack=0.08, decay=0.12, sustain=0.7, release=0.15, peak=0.75,
            mod_depth=-0.08, mod_rate=8)

    def generate_disgusted(self, duration: float = 3.0) -> np.ndarray:
        """Generate disgusted emotion audio.

        130 Hz fundamental with emphasised odd harmonics (3rd, 5th, 7th),
        slight additive Gaussian noise (scale 0.05), a reduced-energy
        envelope and a peak level of 0.65.
        """
        return self._synthesize(
            duration, 130.0, [3, 5, 7], [0.4, 0.25, 0.15],
            attack=0.12, decay=0.1, sustain=0.65, release=0.2, peak=0.65,
            noise_level=0.05)

    def generate_surprised(self, duration: float = 3.0) -> np.ndarray:
        """Generate surprised emotion audio.

        250 Hz fundamental with bright 2nd-4th harmonics, a near-instant
        attack (1 % of the signal) and a peak level of 0.8.
        """
        return self._synthesize(
            duration, 250.0, [2, 3, 4], [0.45, 0.3, 0.2],
            attack=0.01, decay=0.15, sustain=0.8, release=0.12, peak=0.8)


def create_test_dataset(output_dir: Path, samples_per_emotion: int = 10,
                        sample_rate: int = 16000) -> Path:
    """
    Create a synthetic test dataset with multiple samples per emotion.

    One sub-directory per emotion is created under ``output_dir``; each holds
    ``samples_per_emotion`` WAV files named ``<emotion>_<index>.wav`` with a
    random duration between 2.5 and 3.5 seconds.  A ``metadata.json`` file
    describing the dataset is written next to the emotion directories.

    Args:
        output_dir: Directory to save audio files (created if missing)
        samples_per_emotion: Number of samples to generate per emotion
        sample_rate: Sampling rate in Hz used for synthesis, for the WAV
            files and for the metadata, so the three can never disagree

    Returns:
        The output directory as a ``Path``.
    """
    import json  # only needed here; kept local so the block is self-contained

    output_dir = Path(output_dir)

    logger.info("🎵 Creating synthetic test dataset...")
    logger.info(f"Output: {output_dir}")
    logger.info(f"Samples per emotion: {samples_per_emotion}")

    output_dir.mkdir(parents=True, exist_ok=True)

    generator = SyntheticAudioGenerator(sample_rate=sample_rate)

    emotions = {
        "neutral": generator.generate_neutral,
        "happy": generator.generate_happy,
        "sad": generator.generate_sad,
        "angry": generator.generate_angry,
        "fearful": generator.generate_fearful,
        "disgusted": generator.generate_disgusted,
        "surprised": generator.generate_surprised,
    }

    total_files = 0

    for emotion, generate_fn in emotions.items():
        emotion_dir = output_dir / emotion
        emotion_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"\n  Generating {emotion}...")

        for i in range(samples_per_emotion):
            # Vary duration slightly: 2.5 to 3.5 seconds
            duration = 2.5 + np.random.rand() * 1.0

            audio = generate_fn(duration)

            filename = emotion_dir / f"{emotion}_{i:03d}.wav"
            # Write at the generator's own rate rather than a repeated literal.
            sf.write(str(filename), audio, generator.sample_rate)
            total_files += 1

        logger.info(f"    ✓ {samples_per_emotion} files created")

    logger.info(f"\n✅ Total: {total_files} synthetic audio files created")
    logger.info(f"📁 Location: {output_dir}")

    # Create metadata file
    metadata = {
        "dataset_name": "synthetic_emotions_test",
        "total_samples": total_files,
        "samples_per_emotion": samples_per_emotion,
        "emotions": list(emotions.keys()),
        "sample_rate": generator.sample_rate,
        "description": "Synthetic audio samples for testing emotion recognition",
    }

    metadata_path = output_dir / "metadata.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    logger.info(f"📄 Metadata saved to: {metadata_path}")

    return output_dir


def main(argv=None):
    """Command-line entry point: build the synthetic dataset and log next steps.

    Args:
        argv: Optional list of command-line arguments.  When ``None`` (the
            default) ``argparse`` reads ``sys.argv[1:]``, so running the
            script behaves as before; passing a list allows programmatic use.

    Recognised options are ``--output`` (target directory, default
    ``data/raw/synthetic/``) and ``--samples`` (samples per emotion,
    default 10).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Create synthetic test audio data")
    parser.add_argument("--output", type=str, default="data/raw/synthetic/",
                        help="Output directory")
    parser.add_argument("--samples", type=int, default=10,
                        help="Samples per emotion (default: 10)")

    args = parser.parse_args(argv)

    output_dir = Path(args.output)
    create_test_dataset(output_dir, args.samples)

    separator = "=" * 60

    # Follow-up commands are only logged for the user; nothing is executed.
    logger.info("\n" + separator)
    logger.info("Next steps:")
    logger.info(separator)
    logger.info("\n1. Prepare dataset for training:")
    logger.info("\n   python scripts/data/download_ptbr_datasets.py \\")
    logger.info(f"       --prepare-local {output_dir}")
    logger.info("\n2. Fine-tune with synthetic data:")
    logger.info("\n   python scripts/training/finetune_emotion2vec.py \\")
    logger.info("       --dataset data/prepared/synthetic_prepared \\")
    logger.info("       --epochs 5 \\")
    logger.info("       --device cpu")
    logger.info("\n💡 Note: This is synthetic data for testing only.")
    logger.info("   Use real datasets (VERBO, emoUERJ) for production fine-tuning.")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()