Veena committed · Commit 002a88c · Parent(s): 30a893c
Update Maya1 Gradio app with preset characters
Files changed:
- .gitignore +16 -0
- maya1/__init__.py +7 -0
- maya1/api_v2.py +342 -0
- maya1/constants.py +95 -0
- maya1/model_loader.py +145 -0
- maya1/pipeline.py +128 -0
- maya1/prompt_builder.py +31 -0
- maya1/snac_decoder.py +515 -0
- maya1/streaming_pipeline.py +159 -0
.gitignore ADDED
@@ -0,0 +1,16 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+*.egg
+*.egg-info/
+dist/
+build/
+.cache/
+.pytest_cache/
+*.wav
+*.mp3
+.DS_Store
+
maya1/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""
+Maya1 TTS Inference System
+Open-source inference for description-conditioned TTS with emotion control.
+"""
+
+__version__ = "1.0.0"
+__author__ = "Maya Research AI"
maya1/api_v2.py ADDED
@@ -0,0 +1,342 @@
+import os
+import io
+import wave
+import time
+from typing import Optional
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from dotenv import load_dotenv
+
+from .model_loader import Maya1Model
+from .prompt_builder import Maya1PromptBuilder
+from .snac_decoder import SNACDecoder
+from .pipeline import Maya1Pipeline
+from .streaming_pipeline import Maya1SlidingWindowPipeline
+from .constants import (
+    DEFAULT_TEMPERATURE,
+    DEFAULT_TOP_P,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_REPETITION_PENALTY,
+    AUDIO_SAMPLE_RATE,
+)
+
+# Timeout settings (seconds)
+GENERATE_TIMEOUT = 60
+
+# Load environment variables
+load_dotenv()
+
+# Initialize FastAPI app
+app = FastAPI(
+    title="Maya1 TTS API",
+    description="Open source TTS inference for Maya1",
+    version="1.0.0",
+    docs_url=None,
+    redoc_url=None,
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Global state
+model = None
+prompt_builder = None
+snac_decoder = None
+pipeline = None
+streaming_pipeline = None
+
+
+# ============================================================================
+# Startup/Shutdown
+# ============================================================================
+
+@app.on_event("startup")
+async def startup_event():
+    """Initialize model on startup."""
+    global model, prompt_builder, snac_decoder, pipeline, streaming_pipeline
+
+    print("\n" + "="*60)
+    print(" Starting Maya1 TTS API Server")
+    print("="*60 + "\n")
+
+    # Initialize components
+    model = Maya1Model()
+    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)
+
+    # Initialize SNAC decoder
+    snac_decoder = SNACDecoder(enable_batching=True, max_batch_size=64, batch_timeout_ms=15)
+    await snac_decoder.start_batch_processor()
+
+    # Initialize pipelines
+    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)
+    streaming_pipeline = Maya1SlidingWindowPipeline(model, prompt_builder, snac_decoder)
+
+    print("\n" + "="*60)
+    print("Maya1 TTS API Server Ready")
+    print("="*60 + "\n")
+
+
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Cleanup on shutdown."""
+    print("\nShutting down Maya1 TTS API Server")
+
+    if snac_decoder and snac_decoder.is_running:
+        await snac_decoder.stop_batch_processor()
+
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+def create_wav_header(sample_rate: int = 24000, channels: int = 1, bits_per_sample: int = 16, data_size: int = 0) -> bytes:
+    """Create WAV file header."""
+    import struct
+
+    byte_rate = sample_rate * channels * bits_per_sample // 8
+    block_align = channels * bits_per_sample // 8
+
+    header = struct.pack(
+        '<4sI4s4sIHHIIHH4sI',
+        b'RIFF',
+        36 + data_size,
+        b'WAVE',
+        b'fmt ',
+        16,
+        1,
+        channels,
+        sample_rate,
+        byte_rate,
+        block_align,
+        bits_per_sample,
+        b'data',
+        data_size
+    )
+
+    return header
+
+
+# ============================================================================
+# Request/Response Models
+# ============================================================================
+
+class TTSRequest(BaseModel):
+    """TTS generation request."""
+    description: str = Field(
+        ...,
+        description="Voice description (e.g., 'Male voice in their 30s with american accent')"
+    )
+    text: str = Field(
+        ...,
+        description="Text to synthesize (can include <emotion> tags)"
+    )
+    temperature: Optional[float] = Field(
+        default=DEFAULT_TEMPERATURE,
+        description="Sampling temperature"
+    )
+    top_p: Optional[float] = Field(
+        default=DEFAULT_TOP_P,
+        description="Nucleus sampling"
+    )
+    max_tokens: Optional[int] = Field(
+        default=DEFAULT_MAX_TOKENS,
+        description="Maximum tokens to generate"
+    )
+    repetition_penalty: Optional[float] = Field(
+        default=DEFAULT_REPETITION_PENALTY,
+        description="Repetition penalty"
+    )
+    seed: Optional[int] = Field(
+        default=None,
+        description="Random seed for reproducibility",
+        ge=0,
+    )
+    stream: bool = Field(
+        default=False,
+        description="Stream audio (True) or return complete WAV (False)"
+    )
+
+
+# ============================================================================
+# Endpoints
+# ============================================================================
+
+@app.get("/")
+async def root():
+    """Root endpoint."""
+    return {
+        "service": "Maya1 TTS API",
+        "version": "1.0.0",
+        "status": "running",
+        "model": "Maya1-Voice (open source)",
+        "endpoints": {
+            "generate": "/v1/tts/generate (POST)",
+            "health": "/health (GET)",
+        },
+    }
+
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    return {
+        "status": "healthy",
+        "model": "Maya1-Voice",
+        "timestamp": time.time(),
+    }
+
+
+# ============================================================================
+# TTS Generation Endpoint
+# ============================================================================
+
+@app.post("/v1/tts/generate")
+async def generate_tts(request: TTSRequest):
+    """Generate TTS audio from description and text."""
+
+    try:
+        # Route to streaming or non-streaming
+        if request.stream:
+            return await _generate_tts_streaming(
+                description=request.description,
+                text=request.text,
+                temperature=request.temperature,
+                top_p=request.top_p,
+                max_tokens=request.max_tokens,
+                repetition_penalty=request.repetition_penalty,
+                seed=request.seed,
+            )
+        else:
+            return await _generate_tts_complete(
+                description=request.description,
+                text=request.text,
+                temperature=request.temperature,
+                top_p=request.top_p,
+                max_tokens=request.max_tokens,
+                repetition_penalty=request.repetition_penalty,
+                seed=request.seed,
+            )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"Error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+async def _generate_tts_complete(
+    description: str,
+    text: str,
+    temperature: float,
+    top_p: float,
+    max_tokens: int,
+    repetition_penalty: float,
+    seed: Optional[int],
+):
+    """Generate complete WAV file (non-streaming)."""
+
+    try:
+        import asyncio
+
+        # Generate audio
+        audio_bytes = await asyncio.wait_for(
+            pipeline.generate_speech(
+                description=description,
+                text=text,
+                temperature=temperature,
+                top_p=top_p,
+                max_tokens=max_tokens,
+                repetition_penalty=repetition_penalty,
+                seed=seed,
+            ),
+            timeout=GENERATE_TIMEOUT
+        )
+
+        if audio_bytes is None:
+            raise Exception("Audio generation failed")
+
+        # Create WAV file
+        wav_buffer = io.BytesIO()
+        with wave.open(wav_buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(AUDIO_SAMPLE_RATE)
+            wav_file.writeframes(audio_bytes)
+
+        wav_buffer.seek(0)
+
+        return StreamingResponse(
+            wav_buffer,
+            media_type="audio/wav",
+            headers={"Content-Disposition": "attachment; filename=output.wav"}
+        )
+
+    except asyncio.TimeoutError:
+        raise HTTPException(status_code=504, detail="Generation timeout")
+
+
+async def _generate_tts_streaming(
+    description: str,
+    text: str,
+    temperature: float,
+    top_p: float,
+    max_tokens: int,
+    repetition_penalty: float,
+    seed: Optional[int],
+):
+    """Generate streaming audio."""
+    start_time = time.time()
+    first_audio_time = None
+
+    async def audio_stream_generator():
+        """Generate audio stream with WAV header."""
+        nonlocal first_audio_time
+
+        # Send WAV header first
+        yield create_wav_header(sample_rate=AUDIO_SAMPLE_RATE, channels=1, bits_per_sample=16)
+
+        # Stream audio chunks
+        async for audio_chunk in streaming_pipeline.generate_speech_stream(
+            description=description,
+            text=text,
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            seed=seed,
+        ):
+            if first_audio_time is None:
+                first_audio_time = time.time()
+                ttfb_ms = (first_audio_time - start_time) * 1000
+                print(f"⏱️ TTFB: {ttfb_ms:.1f}ms")
+
+            yield audio_chunk
+
+    try:
+        return StreamingResponse(
+            audio_stream_generator(),
+            media_type="audio/wav",
+            headers={"Cache-Control": "no-cache"}
+        )
+
+    except Exception as e:
+        print(f"Streaming error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+# For running directly
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8000,
+        log_level="info"
+    )
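For quick testing of the endpoint above, a minimal client sketch (my own example, not part of the commit; it assumes the server from api_v2.py is running locally on port 8000):

import requests

# Non-streaming: the server returns a complete WAV file.
resp = requests.post(
    "http://localhost:8000/v1/tts/generate",
    json={
        "description": "Male voice in their 30s with american accent",
        "text": "Hello there! <laugh> This is Maya1.",
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(resp.content)

# Streaming: consume chunks as they arrive (the WAV header is sent first).
with requests.post(
    "http://localhost:8000/v1/tts/generate",
    json={"description": "Energetic female voice", "text": "Streaming test.", "stream": True},
    stream=True,
    timeout=120,
) as r:
    r.raise_for_status()
    with open("stream.wav", "wb") as f:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)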
maya1/constants.py ADDED
@@ -0,0 +1,95 @@
+"""
+Maya1 Constants
+Token IDs and special tokens used in the model.
+Matches training configuration exactly.
+"""
+
+# Special control tokens
+SOH_ID = 128259  # Start of Human turn
+EOH_ID = 128260  # End of Human turn
+SOA_ID = 128261  # Start of AI turn
+EOA_ID = 128262  # End of AI turn (not used in maya1)
+PAD_ID = 128263  # Padding token
+
+# Text tokens
+BOS_ID = 128000  # Begin of sequence (Llama BOS)
+TEXT_EOT_ID = 128009  # End of text (appears in prefix, not a stop token!)
+
+# Audio tokens
+CODE_START_TOKEN_ID = 128257  # SOS - Start of Speech
+CODE_END_TOKEN_ID = 128258  # EOS - End of Speech (audio stop token)
+CODE_TOKEN_OFFSET = 128266  # Start of SNAC codes
+
+# SNAC token range
+SNAC_MIN_ID = 128266
+SNAC_MAX_ID = 156937  # 128266 + (7 * 4096) - 1
+
+# Stop tokens for generation
+# CRITICAL: Only use CODE_END_TOKEN_ID (128258) for audio generation
+# TEXT_EOT_ID (128009) appears in prefix and should NOT stop generation
+TRAINING_STOP_TOKEN_IDS = [CODE_END_TOKEN_ID]  # [128258]
+ALL_POSSIBLE_STOP_TOKENS = [TEXT_EOT_ID, CODE_END_TOKEN_ID]  # For reference only
+
+# 20 Extended Emotion Tags (must be single tokens)
+ALL_EMOTION_TAGS = [
+    '<angry>',
+    '<appalled>',
+    '<chuckle>',
+    '<cry>',
+    '<curious>',
+    '<disappointed>',
+    '<excited>',
+    '<exhale>',
+    '<gasp>',
+    '<giggle>',
+    '<gulp>',
+    '<laugh>',
+    '<laugh_harder>',
+    '<mischievous>',
+    '<sarcastic>',
+    '<scream>',
+    '<sigh>',
+    '<sing>',
+    '<snort>',
+    '<whisper>',
+]
+
+# Model configuration
+DEFAULT_MODEL_PATH = "maya-research/maya1"
+DEFAULT_CHECKPOINT = "checkpoint-25000"
+DEFAULT_MAX_MODEL_LEN = 8192
+
+# SNAC configuration
+SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
+SNAC_SAMPLE_RATE = 24000
+SNAC_TOKENS_PER_FRAME = 7
+SNAC_LEVELS = 3
+
+# Audio configuration
+AUDIO_SAMPLE_RATE = 24000
+AUDIO_CHANNELS = 1
+AUDIO_BITS_PER_SAMPLE = 16
+
+# Generation defaults
+DEFAULT_TEMPERATURE = 0.4  # Lower temp for more stable generation
+DEFAULT_TOP_P = 0.9
+DEFAULT_MAX_TOKENS = 2048  # Reasonable default for most use cases
+DEFAULT_MIN_TOKENS = 28  # At least 4 SNAC frames
+DEFAULT_REPETITION_PENALTY = 1.1
+DEFAULT_SEED = None  # None = random, set integer for reproducibility
+
+# IMPORTANT: Emotion tags consume audio time!
+# <laugh> = ~4-6 seconds (~300-400 tokens)
+# <excited>, <chuckle> = ~1-2 seconds (~50-150 tokens)
+
+# Recommended max_tokens by use case:
+# - Short phrases (< 10 words): 150-250 tokens (~3-5s)
+# - Medium text (10-30 words): 250-500 tokens (~5-10s)
+# - Long text (30+ words): 500-1500 tokens (~10-30s)
+# - Very long text: 1500-2000 tokens (~30-42s)
+# Note: 1 second ≈ 48 tokens (7 tokens/frame * 6.86 frames/sec)
+
+# Streaming configuration
+STREAM_BUFFER_SIZE = 28  # 4 frames (process every 28 tokens)
+SNAC_BATCH_SIZE = 64
+SNAC_BATCH_TIMEOUT_MS = 15
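The token-to-duration arithmetic in those comments is worth a worked example. The following sketch is mine, not part of the commit (the helper name tokens_for_seconds is hypothetical), and just restates the math: 7 tokens per frame at ~6.86 frames per second gives ~48 tokens per second of audio.

SNAC_TOKENS_PER_FRAME = 7
FRAMES_PER_SECOND = 6.86
TOKENS_PER_SECOND = SNAC_TOKENS_PER_FRAME * FRAMES_PER_SECOND  # ≈ 48.0

def tokens_for_seconds(seconds: float) -> int:
    """Estimate a max_tokens budget for a target duration,
    rounded up to whole 7-token frames."""
    frames = int(seconds * FRAMES_PER_SECOND) + 1
    return frames * SNAC_TOKENS_PER_FRAME

print(tokens_for_seconds(10))      # ~483 tokens for ~10s of speech
print(2048 / TOKENS_PER_SECOND)    # ≈ 42.6s, matching the "~30-42s" note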
maya1/model_loader.py ADDED
@@ -0,0 +1,145 @@
+"""
+Maya1 Model Loader
+Loads Maya1 model with vLLM engine and validates emotion tags.
+"""
+
+import os
+from transformers import AutoTokenizer
+from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
+from .constants import (
+    ALL_EMOTION_TAGS,
+    DEFAULT_MAX_MODEL_LEN,
+    SOH_ID, EOH_ID, SOA_ID, BOS_ID, TEXT_EOT_ID, CODE_START_TOKEN_ID,
+)
+
+
+class Maya1Model:
+    """Maya1 TTS Model with vLLM inference engine."""
+
+    def __init__(
+        self,
+        model_path: str = None,
+        dtype: str = "bfloat16",
+        max_model_len: int = DEFAULT_MAX_MODEL_LEN,
+        gpu_memory_utilization: float = 0.85,
+        tensor_parallel_size: int = 1,
+        **engine_kwargs
+    ):
+        """
+        Initialize Maya1 model with vLLM.
+
+        Args:
+            model_path: Path to checkpoint (local or HF repo)
+            dtype: Model precision (bfloat16 recommended)
+            max_model_len: Maximum sequence length
+            gpu_memory_utilization: GPU memory fraction
+            tensor_parallel_size: Number of GPUs
+        """
+        # Use provided path or environment variable or default
+        if model_path is None:
+            model_path = os.environ.get(
+                'MAYA1_MODEL_PATH',
+                os.path.expanduser('~/models/maya1-voice')
+            )
+
+        self.model_path = model_path
+        self.dtype = dtype
+
+        print(f"Initializing Maya1 Model")
+        print(f"Model: {model_path}")
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+        )
+
+        print(f"Tokenizer loaded: {len(self.tokenizer)} tokens")
+
+        # Validate emotion tags
+        self._validate_emotion_tags()
+
+        # Precompute special token strings
+        self._init_special_tokens()
+
+        # Initialize vLLM engine
+        print(f"Initializing vLLM engine...")
+        engine_args = AsyncEngineArgs(
+            model=model_path,
+            tokenizer=model_path,
+            dtype=dtype,
+            max_model_len=max_model_len,
+            gpu_memory_utilization=gpu_memory_utilization,
+            tensor_parallel_size=tensor_parallel_size,
+            trust_remote_code=True,
+            disable_log_stats=False,
+            **engine_kwargs
+        )
+
+        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+        print(f"Maya1 Model ready\n")
+
+    def _validate_emotion_tags(self):
+        """Validate that all 20 emotion tags are single tokens."""
+        failed_tags = []
+        for tag in ALL_EMOTION_TAGS:
+            token_ids = self.tokenizer.encode(tag, add_special_tokens=False)
+            if len(token_ids) != 1:
+                failed_tags.append((tag, len(token_ids)))
+
+        if failed_tags:
+            print(f"ERROR: {len(failed_tags)} emotion tags are NOT single tokens!")
+            raise AssertionError(f"Emotion tags validation failed")
+
+        print(f"All {len(ALL_EMOTION_TAGS)} emotion tags validated")
+
+    def _init_special_tokens(self):
+        """Precompute special token strings for fast prefix building."""
+        self.soh_token = self.tokenizer.decode([SOH_ID])
+        self.bos_token = self.tokenizer.bos_token
+        self.eot_token = self.tokenizer.decode([TEXT_EOT_ID])
+        self.eoh_token = self.tokenizer.decode([EOH_ID])
+        self.soa_token = self.tokenizer.decode([SOA_ID])
+        self.sos_token = self.tokenizer.decode([CODE_START_TOKEN_ID])
+
+    async def generate(self, prompt: str, sampling_params: SamplingParams):
+        """
+        Generate tokens from prompt (non-streaming).
+        Args:
+            prompt: Input prompt
+            sampling_params: vLLM sampling parameters
+        Returns:
+            Generated output from vLLM
+        """
+        request_id = f"req_{id(prompt)}"
+
+        # Collect results from async generator
+        final_output = None
+        async for output in self.engine.generate(
+            prompt=prompt,
+            sampling_params=sampling_params,
+            request_id=request_id
+        ):
+            final_output = output
+
+        return [final_output] if final_output else []
+
+    async def generate_stream(self, prompt: str, sampling_params: SamplingParams):
+        """
+        Generate tokens from prompt (streaming).
+        Args:
+            prompt: Input prompt
+            sampling_params: vLLM sampling parameters
+        Yields:
+            Generated outputs from vLLM
+        """
+        request_id = f"req_{id(prompt)}"
+
+        # Stream from engine
+        async for output in self.engine.generate(
+            prompt=prompt,
+            sampling_params=sampling_params,
+            request_id=request_id
+        ):
+            yield output
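Construction is straightforward; an illustrative sketch of instantiating the loader (my own example, assuming a CUDA machine and that the model path points at a valid checkpoint or HF repo):

import os
from maya1.model_loader import Maya1Model

# Point the loader at a local checkpoint or HF repo before constructing;
# otherwise it falls back to ~/models/maya1-voice.
os.environ["MAYA1_MODEL_PATH"] = "maya-research/maya1"

model = Maya1Model(
    dtype="bfloat16",
    max_model_len=8192,
    gpu_memory_utilization=0.85,
)
# The precomputed special-token strings are available as attributes:
print(repr(model.soh_token), repr(model.sos_token))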
maya1/pipeline.py ADDED
@@ -0,0 +1,128 @@
+"""
+Maya1 Generation Pipeline
+End-to-end pipeline for TTS generation (non-streaming).
+"""
+
+import asyncio
+from typing import Optional, List
+from vllm import SamplingParams
+
+from .constants import (
+    CODE_END_TOKEN_ID,
+    CODE_START_TOKEN_ID,
+    SNAC_MIN_ID,
+    SNAC_MAX_ID,
+    DEFAULT_TEMPERATURE,
+    DEFAULT_TOP_P,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_MIN_TOKENS,
+    DEFAULT_REPETITION_PENALTY,
+    DEFAULT_SEED,
+)
+
+
+class Maya1Pipeline:
+    """End-to-end TTS pipeline for Maya1."""
+
+    def __init__(self, model, prompt_builder, snac_decoder):
+        """
+        Initialize pipeline.
+        Args:
+            model: Maya1Model instance
+            prompt_builder: Maya1PromptBuilder instance
+            snac_decoder: SNACDecoder instance
+        """
+        self.model = model
+        self.prompt_builder = prompt_builder
+        self.snac_decoder = snac_decoder
+        print(f"✅ Maya1Pipeline initialized")
+
+    async def generate_speech(
+        self,
+        description: str,
+        text: str,
+        temperature: float = DEFAULT_TEMPERATURE,
+        top_p: float = DEFAULT_TOP_P,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
+        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
+        seed: Optional[int] = None,
+    ) -> Optional[bytes]:
+        """
+        Generate speech audio (non-streaming).
+        Args:
+            description: Voice description
+            text: Text to synthesize (may include <emotion> tags)
+            temperature: Sampling temperature
+            top_p: Nucleus sampling
+            max_tokens: Max SNAC tokens to generate
+            repetition_penalty: Prevent loops
+            seed: Random seed for reproducibility
+
+        Returns:
+            Audio bytes (int16 PCM, 24kHz mono) or None if failed
+        """
+        # Build prompt
+        prompt = self.prompt_builder.build_prefix(description, text)
+
+        # Configure sampling
+        sampling_params = SamplingParams(
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            min_tokens=DEFAULT_MIN_TOKENS,
+            repetition_penalty=repetition_penalty,
+            stop_token_ids=[CODE_END_TOKEN_ID],
+            seed=seed if seed is not None else DEFAULT_SEED,
+        )
+
+        # Generate tokens
+        outputs = await self.model.generate(prompt, sampling_params)
+
+        if not outputs or len(outputs) == 0:
+            return None
+
+        output = outputs[0]
+        generated_token_ids = output.outputs[0].token_ids
+
+        # Extract SNAC codes
+        snac_codes = self._extract_snac_codes(generated_token_ids)
+
+        if not snac_codes:
+            return None
+
+        # Decode to audio
+        audio_bytes = await self.snac_decoder.decode_single_async(snac_codes)
+
+        if audio_bytes:
+            frames = len(snac_codes) // 7
+            duration_sec = frames / 6.86
+            print(f"Generated {frames} frames (~{duration_sec:.1f}s audio)")
+
+        return audio_bytes
+
+    def _extract_snac_codes(self, token_ids: List[int]) -> List[int]:
+        # Find SOS and EOS positions
+        try:
+            sos_idx = token_ids.index(CODE_START_TOKEN_ID)
+        except ValueError:
+            sos_idx = -1
+
+        try:
+            eos_idx = token_ids.index(CODE_END_TOKEN_ID)
+        except ValueError:
+            eos_idx = len(token_ids)
+
+        # Extract tokens between SOS and EOS
+        if sos_idx >= 0:
+            snac_tokens = token_ids[sos_idx + 1:eos_idx]
+        else:
+            # If no SOS found, take everything before EOS
+            snac_tokens = token_ids[:eos_idx]
+
+        # Filter to only valid SNAC token IDs
+        snac_codes = [
+            token_id for token_id in snac_tokens
+            if SNAC_MIN_ID <= token_id <= SNAC_MAX_ID
+        ]
+
+        return snac_codes
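The extraction step is easiest to see on a toy sequence. This self-contained sketch (my own, with made-up token values) mirrors the slicing and filtering of _extract_snac_codes:

CODE_START_TOKEN_ID = 128257  # SOS
CODE_END_TOKEN_ID = 128258    # EOS
SNAC_MIN_ID, SNAC_MAX_ID = 128266, 156937

def extract_snac_codes(token_ids):
    # Slice between SOS (exclusive) and EOS, then keep only SNAC-range IDs.
    sos = token_ids.index(CODE_START_TOKEN_ID) if CODE_START_TOKEN_ID in token_ids else -1
    eos = token_ids.index(CODE_END_TOKEN_ID) if CODE_END_TOKEN_ID in token_ids else len(token_ids)
    window = token_ids[sos + 1:eos] if sos >= 0 else token_ids[:eos]
    return [t for t in window if SNAC_MIN_ID <= t <= SNAC_MAX_ID]

tokens = [128257, 128300, 128400, 999, 130000, 128258, 128300]
print(extract_snac_codes(tokens))
# [128300, 128400, 130000] - stray out-of-range ID dropped, EOS cuts the tail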
maya1/prompt_builder.py ADDED
@@ -0,0 +1,31 @@
+"""
+Maya1 Prompt Builder
+Builds formatted prompts for description-conditioned TTS.
+Format: <SOH><BOS><description="..."> text<EOT><EOH><SOA><SOS>
+"""
+
+from .constants import ALL_EMOTION_TAGS
+
+
+class Maya1PromptBuilder:
+    """Builds prompts in the format expected by Maya1 model."""
+
+    def __init__(self, tokenizer, model):
+        self.tokenizer = tokenizer
+        self.model = model
+
+    def build_prefix(self, description: str, text: str) -> str:
+        # Format as: <description="..."> text
+        formatted_text = f'<description="{description}"> {text}'
+        # Build full prefix with special tokens
+        prompt = (
+            self.model.soh_token +
+            self.model.bos_token +
+            formatted_text +
+            self.model.eot_token +
+            self.model.eoh_token +
+            self.model.soa_token +
+            self.model.sos_token
+        )
+
+        return prompt
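Spelled out with the IDs from constants.py, the prefix that build_prefix assembles has this shape (illustrative only; the literal strings for the special tokens depend on the tokenizer's vocabulary):

# <SOH><BOS><description="..."> text<EOT><EOH><SOA><SOS>
#   SOH = 128259, BOS = 128000, EOT = 128009,
#   EOH = 128260, SOA = 128261, SOS = 128257
description = "Female voice in their 20s, warm and friendly"
text = "Good morning! <giggle> Ready when you are."
formatted = f'<description="{description}"> {text}'
print(formatted)
# The model's continuation after <SOS> is then expected to be SNAC audio
# tokens (IDs 128266-156937), terminated by EOS (128258).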
maya1/snac_decoder.py ADDED
@@ -0,0 +1,515 @@
+import torch
+import numpy as np
+import asyncio
+from typing import List, Optional, Tuple
+from snac import SNAC
+
+from .constants import (
+    CODE_END_TOKEN_ID,
+    CODE_TOKEN_OFFSET,
+    SNAC_MODEL_NAME,
+    SNAC_SAMPLE_RATE,
+    SNAC_TOKENS_PER_FRAME,
+)
+
+
+class SNACDecoder:
+    """
+    SNAC Decoder for maya1.
+    Unpacks 7-token SNAC frames and decodes to audio waveforms.
+    Unpacking logic is the EXACT INVERSE of training preprocessing.
+    Supports async batching for concurrent requests.
+    CRITICAL: Any mismatch in unpacking will produce garbage audio.
+    """
+
+    def __init__(
+        self,
+        device: str = "cuda",
+        compile_decoder: bool = False,
+        enable_batching: bool = False,
+        max_batch_size: int = 64,
+        batch_timeout_ms: int = 15,
+    ):
+        """
+        Initialize SNAC decoder.
+
+        Args:
+            device: Device for SNAC model (cuda/cpu)
+            compile_decoder: Use torch.compile for speedup
+            enable_batching: Enable async batching
+            max_batch_size: Max sequences to batch together
+            batch_timeout_ms: Max wait time before processing batch
+        """
+        self.device = device
+        self.enable_batching = enable_batching
+        self.max_batch_size = max_batch_size
+        self.batch_timeout_ms = batch_timeout_ms
+
+        print(f"Loading SNAC 24kHz model to {device}...")
+        self.snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME).eval().to(device)
+
+        if compile_decoder:
+            print(f"Compiling SNAC decoder with torch.compile...")
+            self._compile_model()
+
+        # Batching infrastructure
+        if enable_batching:
+            self.request_queue = asyncio.Queue()
+            self.batch_processor_task = None
+            self._running = False
+            print(f"Batching enabled (max_batch={max_batch_size}, timeout={batch_timeout_ms}ms)")
+
+        print(f"SNAC decoder initialized")
+
+    def _compile_model(self):
+        """Compile SNAC decoder with torch.compile"""
+        # Warm up with various sizes
+        for frames in [4, 16, 32]:
+            dummy_codes = [
+                torch.randint(0, 4096, (1, frames), device=self.device),
+                torch.randint(0, 4096, (1, frames * 2), device=self.device),
+                torch.randint(0, 4096, (1, frames * 4), device=self.device),
+            ]
+            with torch.inference_mode():
+                z_q = self.snac_model.quantizer.from_codes(dummy_codes)
+                _ = self.snac_model.decoder(z_q)
+
+        # Apply compilation
+        self.snac_model.decoder = torch.compile(
+            self.snac_model.decoder,
+            mode="max-autotune"
+        )
+        self.snac_model.quantizer = torch.compile(
+            self.snac_model.quantizer,
+            mode="reduce-overhead"
+        )
+
+        print(f"SNAC decoder compiled")
+
+    def unpack_snac_from_7(self, vocab_ids: List[int]) -> List[List[int]]:
+        """
+        Unpack 7-token SNAC frames to 3 hierarchical levels.
+
+        This is the EXACT INVERSE of the training preprocessing function
+        `pack_snac_to_7_and_offset()`.
+
+        Frame structure:
+            [slot0, slot1, slot2, slot3, slot4, slot5, slot6]
+
+        Unpacking:
+            - slot0: L1[i]
+            - slot1: L2[2*i] (even index)
+            - slot2: L3[4*i + 0]
+            - slot3: L3[4*i + 1]
+            - slot4: L2[2*i + 1] (odd index)
+            - slot5: L3[4*i + 2]
+            - slot6: L3[4*i + 3]
+
+        Args:
+            vocab_ids: List of SNAC token IDs (128266-156937)
+                Must be divisible by 7
+
+        Returns:
+            [L1, L2, L3] where:
+                L1: n elements (coarse level)
+                L2: 2n elements (medium level)
+                L3: 4n elements (fine level)
+        """
+        # Strip EOS token if present
+        if vocab_ids and vocab_ids[-1] == CODE_END_TOKEN_ID:
+            vocab_ids = vocab_ids[:-1]
+
+        # Ensure complete frames (divisible by 7)
+        frames = len(vocab_ids) // SNAC_TOKENS_PER_FRAME
+        vocab_ids = vocab_ids[:frames * SNAC_TOKENS_PER_FRAME]
+
+        if frames == 0:
+            return [[], [], []]
+
+        l1, l2, l3 = [], [], []
+
+        for i in range(frames):
+            # Extract 7 slots for this frame
+            slots = vocab_ids[i*7:(i+1)*7]
+
+            # Subtract offset (128266) and mod 4096 to get original codes
+            # Each level uses 4096 codes (0-4095)
+            l1.append((slots[0] - CODE_TOKEN_OFFSET) % 4096)
+            l2.extend([
+                (slots[1] - CODE_TOKEN_OFFSET) % 4096,  # Even index
+                (slots[4] - CODE_TOKEN_OFFSET) % 4096,  # Odd index
+            ])
+            l3.extend([
+                (slots[2] - CODE_TOKEN_OFFSET) % 4096,
+                (slots[3] - CODE_TOKEN_OFFSET) % 4096,
+                (slots[5] - CODE_TOKEN_OFFSET) % 4096,
+                (slots[6] - CODE_TOKEN_OFFSET) % 4096,
+            ])
+
+        return [l1, l2, l3]
+
+    @torch.inference_mode()
+    def decode(
+        self,
+        snac_tokens: List[int],
+        trim_warmup: bool = True,
+        trim_amount: Optional[int] = None,
+        use_sliding_window: bool = False
+    ) -> Optional[np.ndarray]:
+        """
+        Decode SNAC tokens to audio waveform.
+
+        Args:
+            snac_tokens: List of SNAC token IDs (7*n tokens)
+            trim_warmup: Whether to trim SNAC warmup samples (default: True)
+            trim_amount: Number of samples to trim (default: 2048 for first chunk, 0 for others)
+                Can be set to a smaller value (e.g., 512) for intermediate chunks
+            use_sliding_window: If True, only return middle 2048 samples (for sliding window streaming)
+
+        Returns:
+            Audio waveform as numpy array (float32, 24kHz mono)
+            Shape: (samples,)
+            Returns None if not enough tokens
+        """
+        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
+            print(f"Not enough SNAC tokens: {len(snac_tokens)} < {SNAC_TOKENS_PER_FRAME}")
+            return None
+
+        # Unpack to 3 levels
+        levels = self.unpack_snac_from_7(snac_tokens)
+
+        if not levels[0]:  # No frames after unpacking
+            return None
+
+        # Convert to tensors
+        codes = [
+            torch.tensor(level, dtype=torch.long, device=self.device).unsqueeze(0)
+            for level in levels
+        ]
+
+        # Decode through SNAC
+        z_q = self.snac_model.quantizer.from_codes(codes)
+        audio = self.snac_model.decoder(z_q)
+
+        # Extract audio (remove padding if any)
+        # SNAC decoder outputs: [batch, 1, samples]
+        audio = audio[0, 0].cpu().numpy()
+
+        # Sliding window mode: only keep middle 2048 samples
+        # This eliminates popping/cracking when using overlapping 28-token windows
+        if use_sliding_window:
+            if len(audio) >= 4096:
+                audio = audio[2048:4096]  # Keep middle portion only
+            else:
+                # For shorter audio, keep everything (final chunk)
+                pass
+        else:
+            # Standard mode: trim warm-up samples
+            # Default: 2048 samples for first chunk, 0 for subsequent chunks
+            # Can be customized via trim_amount parameter
+            if trim_warmup:
+                if trim_amount is None:
+                    trim_amount = 2048  # Default full trim
+
+                if len(audio) > trim_amount:
+                    audio = audio[trim_amount:]
+
+        return audio
+
+    def decode_to_bytes(
+        self,
+        snac_tokens: List[int],
+        trim_warmup: bool = True,
+        use_sliding_window: bool = False
+    ) -> Optional[bytes]:
+        """
+        Decode SNAC tokens to audio bytes (int16 PCM).
+
+        Args:
+            snac_tokens: List of SNAC token IDs
+            trim_warmup: Whether to trim SNAC warmup samples (default: True)
+            use_sliding_window: If True, only return middle 2048 samples (for sliding window streaming)
+
+        Returns:
+            Audio as bytes (int16 PCM, 24kHz mono)
+            Returns None if decode fails
+        """
+        audio = self.decode(snac_tokens, trim_warmup=trim_warmup, use_sliding_window=use_sliding_window)
+
+        if audio is None:
+            return None
+
+        # Convert float32 to int16 PCM
+        audio_int16 = (audio * 32767).astype(np.int16)
+
+        return audio_int16.tobytes()
+
+    def validate_tokens(self, snac_tokens: List[int]) -> bool:
+        """
+        Validate SNAC tokens before decoding.
+        Args:
+            snac_tokens: List of SNAC token IDs
+        Returns:
+            True if valid, False otherwise
+        """
+        # Check minimum length
+        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
+            print(f"Too few tokens: {len(snac_tokens)}")
+            return False
+
+        # Check divisibility by 7
+        if len(snac_tokens) % SNAC_TOKENS_PER_FRAME != 0:
+            print(f"Warning: Token count {len(snac_tokens)} not divisible by 7")
+            print(f"Will truncate to {(len(snac_tokens) // 7) * 7}")
+
+        # Check token range
+        for i, token_id in enumerate(snac_tokens):
+            if token_id < CODE_TOKEN_OFFSET or token_id > 156937:
+                print(f"Invalid token at position {i}: {token_id}")
+                print(f"Expected range: [{CODE_TOKEN_OFFSET}, 156937]")
+                return False
+
+        return True
+
+    # ========== Async Batching Methods ==========
+
+    @property
+    def is_running(self) -> bool:
+        """Check if batch processor is running."""
+        return self._running if self.enable_batching else False
+
+    async def start_batch_processor(self):
+        """Start the background batch processor task."""
+        if not self.enable_batching:
+            return
+
+        if self._running:
+            print("Batch processor already running")
+            return
+
+        self._running = True
+        self.batch_processor_task = asyncio.create_task(self._batch_processor_loop())
+        print("Batch processor started")
+
+    async def stop_batch_processor(self):
+        """Stop the background batch processor task."""
+        if not self.enable_batching:
+            return
+
+        if not self._running:
+            return
+
+        self._running = False
+
+        if self.batch_processor_task:
+            self.batch_processor_task.cancel()
+            try:
+                await self.batch_processor_task
+            except asyncio.CancelledError:
+                pass
+
+        print("Batch processor stopped")
+
+    async def decode_single_async(
+        self,
+        snac_tokens: List[int],
+        trim_warmup: bool = True,
+        use_sliding_window: bool = False
+    ) -> Optional[bytes]:
+        """
+        Async decode for batching support.
+
+        Queues the request and waits for batched processing.
+
+        Args:
+            snac_tokens: List of SNAC token IDs
+            trim_warmup: Whether to trim SNAC warmup samples (default: True)
+            use_sliding_window: If True, only return middle 2048 samples (for sliding window streaming)
+
+        Returns:
+            Audio bytes or None if decode fails
+        """
+        if not self.enable_batching:
+            # Fallback to synchronous decode
+            return self.decode_to_bytes(snac_tokens, trim_warmup=trim_warmup, use_sliding_window=use_sliding_window)
+
+        # Create future for result
+        result_future = asyncio.Future()
+
+        # Add to queue (include trim_warmup and sliding_window flags)
+        await self.request_queue.put((snac_tokens, trim_warmup, use_sliding_window, result_future))
+
+        # Wait for result
+        return await result_future
+
+    async def _batch_processor_loop(self):
+        """Background task that processes batched decode requests."""
+        while self._running:
+            try:
+                # Collect batch
+                batch = await self._collect_batch()
+
+                if not batch:
+                    continue
+
+                # Process batch
+                await self._process_batch(batch)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                print(f"Batch processor error: {e}")
+                import traceback
+                traceback.print_exc()
+
+    async def _collect_batch(self) -> List[Tuple[List[int], bool, bool, asyncio.Future]]:
+        """
+        Collect requests into a batch.
+        Waits for timeout or until batch is full.
+        Returns:
+            List of (tokens, trim_warmup, use_sliding_window, future) tuples
+        """
+        batch = []
+        timeout_sec = self.batch_timeout_ms / 1000.0
+
+        try:
+            # Wait for first request (blocking)
+            first_item = await asyncio.wait_for(
+                self.request_queue.get(),
+                timeout=timeout_sec
+            )
+            batch.append(first_item)
+
+            # Collect more requests (non-blocking)
+            while len(batch) < self.max_batch_size:
+                try:
+                    item = await asyncio.wait_for(
+                        self.request_queue.get(),
+                        timeout=timeout_sec
+                    )
+                    batch.append(item)
+                except asyncio.TimeoutError:
+                    break  # Timeout reached, process what we have
+
+        except asyncio.TimeoutError:
+            # No requests in timeout period
+            pass
+
+        return batch
+
+    @torch.inference_mode()
+    async def _process_batch(self, batch: List[Tuple[List[int], bool, bool, asyncio.Future]]):
+        """
+        Process a batch of decode requests.
+        Args:
+            batch: List of (tokens, trim_warmup, use_sliding_window, future) tuples
+        """
+        if not batch:
+            return
+
+        # Extract components
+        token_sequences = [item[0] for item in batch]
+        trim_warmup_flags = [item[1] for item in batch]
+        sliding_window_flags = [item[2] for item in batch]
+        futures = [item[3] for item in batch]
+
+        lengths = [len(tokens) for tokens in token_sequences]
+        can_batch_efficiently = len(set(lengths)) == 1
+
+        if can_batch_efficiently and len(batch) > 1:
+            # Efficient batching: all same length
+            try:
+                audio_bytes_list = await self._decode_batch_same_length(
+                    token_sequences, trim_warmup_flags, sliding_window_flags
+                )
+
+                # Set results
+                for future, audio_bytes in zip(futures, audio_bytes_list):
+                    if not future.done():
+                        future.set_result(audio_bytes)
+
+            except Exception as e:
+                # Set exceptions
+                for future in futures:
+                    if not future.done():
+                        future.set_exception(e)
+        else:
+            # Sequential decode (different lengths or single item)
+            for tokens, trim_warmup, use_sliding_window, future in batch:
+                try:
+                    audio_bytes = self.decode_to_bytes(
+                        tokens, trim_warmup=trim_warmup, use_sliding_window=use_sliding_window
+                    )
+                    if not future.done():
+                        future.set_result(audio_bytes)
+                except Exception as e:
+                    if not future.done():
+                        future.set_exception(e)
+
+    async def _decode_batch_same_length(
+        self,
+        token_sequences: List[List[int]],
+        trim_warmup_flags: List[bool],
+        sliding_window_flags: List[bool]
+    ) -> List[Optional[bytes]]:
+        """
+        Decode multiple sequences with same length in parallel.
+
+        Args:
+            token_sequences: List of token sequences (all same length)
+            trim_warmup_flags: List of trim_warmup flags for each sequence
+            sliding_window_flags: List of use_sliding_window flags for each sequence
+
+        Returns:
+            List of audio bytes
+        """
+        if not token_sequences:
+            return []
+
+        # Unpack all sequences
+        unpacked_list = [self.unpack_snac_from_7(tokens) for tokens in token_sequences]
+
+        # Check all have valid frames
+        valid_indices = [i for i, levels in enumerate(unpacked_list) if levels[0]]
+
+        if not valid_indices:
+            return [None] * len(token_sequences)
+
+        # Stack into batched tensors
+        batch_size = len(valid_indices)
+        frames = len(unpacked_list[valid_indices[0]][0])
+
+        # Build batched codes [batch, frames], [batch, 2*frames], [batch, 4*frames]
+        codes = [
+            torch.stack([
+                torch.tensor(unpacked_list[i][level_idx], dtype=torch.long, device=self.device)
+                for i in valid_indices
+            ], dim=0)
+            for level_idx in range(3)
+        ]
+
+        # Batched decode
+        z_q = self.snac_model.quantizer.from_codes(codes)
+        audio_batch = self.snac_model.decoder(z_q)  # [batch, 1, samples]
+
+        # Extract and convert to bytes
+        audio_bytes_list = [None] * len(token_sequences)
+
+        for batch_idx, orig_idx in enumerate(valid_indices):
+            audio = audio_batch[batch_idx, 0].detach().cpu().numpy()
+
+            # Apply sliding window or trim warmup based on flags
+            if sliding_window_flags[orig_idx]:
+                # Sliding window mode: keep middle 2048 samples only
+                if len(audio) >= 4096:
+                    audio = audio[2048:4096]
+            else:
+                # Standard mode: trim warm-up if requested
+                if trim_warmup_flags[orig_idx] and len(audio) > 2048:
+                    audio = audio[2048:]
+
+            # Convert to int16
+            audio_int16 = (audio * 32767).astype(np.int16)
+            audio_bytes_list[orig_idx] = audio_int16.tobytes()
+
+        return audio_bytes_list
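The 7-slot interleaving is the part most worth sanity-checking by hand, since any mismatch with training produces garbage audio. A standalone sketch (my own, toy code values) of the same unpacking applied to one synthetic frame:

CODE_TOKEN_OFFSET = 128266

# One packed frame: [L1[0], L2[0], L3[0], L3[1], L2[1], L3[2], L3[3]]
frame = [CODE_TOKEN_OFFSET + c for c in [10, 20, 30, 31, 21, 32, 33]]

l1 = [(frame[0] - CODE_TOKEN_OFFSET) % 4096]                           # coarse
l2 = [(frame[s] - CODE_TOKEN_OFFSET) % 4096 for s in (1, 4)]           # medium
l3 = [(frame[s] - CODE_TOKEN_OFFSET) % 4096 for s in (2, 3, 5, 6)]     # fine

print(l1, l2, l3)  # [10] [20, 21] [30, 31, 32, 33]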
maya1/streaming_pipeline.py ADDED
@@ -0,0 +1,159 @@
+"""
+Maya1 Streaming Pipeline - Sliding Window Approach
+Implements sliding window technique for smooth streaming without artifacts.
+"""
+
+import asyncio
+from typing import AsyncGenerator, Optional
+from vllm import SamplingParams
+
+from .constants import (
+    CODE_END_TOKEN_ID,
+    SNAC_MIN_ID,
+    SNAC_MAX_ID,
+    DEFAULT_TEMPERATURE,
+    DEFAULT_TOP_P,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_MIN_TOKENS,
+    DEFAULT_REPETITION_PENALTY,
+    DEFAULT_SEED,
+)
+
+
+class Maya1SlidingWindowPipeline:
+    """
+    Streaming TTS pipeline using sliding window approach.
+    Decodes overlapping 28-token windows (4 frames) and keeps only
+    the middle 2048 samples for smooth audio continuity.
+    """
+
+    # Sliding window configuration
+    WINDOW_SIZE = 28  # 4 frames (7 tokens per frame)
+    YIELD_STRIDE = 7  # Yield every 1 frame
+    MIDDLE_SAMPLES = 2048  # Keep middle 2048 samples from each decode
+
+    def __init__(self, model, prompt_builder, snac_decoder):
+        """
+        Initialize sliding window streaming pipeline.
+
+        Args:
+            model: Maya1Model instance
+            prompt_builder: Maya1PromptBuilder instance
+            snac_decoder: SNACDecoder instance
+        """
+        self.model = model
+        self.prompt_builder = prompt_builder
+        self.snac_decoder = snac_decoder
+        print(f"Sliding window pipeline initialized")
+
+    async def generate_speech_stream(
+        self,
+        description: str,
+        text: str,
+        temperature: float = DEFAULT_TEMPERATURE,
+        top_p: float = DEFAULT_TOP_P,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
+        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
+        seed: Optional[int] = None,
+    ) -> AsyncGenerator[bytes, None]:
+        """
+        Generate speech audio with sliding window streaming.
+
+        Args:
+            description: Voice description
+            text: Text to synthesize (may include <emotion> tags)
+            temperature: Sampling temperature
+            top_p: Nucleus sampling
+            max_tokens: Max SNAC tokens to generate
+            repetition_penalty: Prevent loops
+            seed: Random seed
+
+        Yields:
+            Audio bytes (int16 PCM, 24kHz mono)
+        """
+        # Build prompt
+        prompt = self.prompt_builder.build_prefix(description, text)
+
+        # Configure sampling
+        sampling_params = SamplingParams(
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            min_tokens=DEFAULT_MIN_TOKENS,
+            repetition_penalty=repetition_penalty,
+            stop_token_ids=[CODE_END_TOKEN_ID],
+            seed=seed if seed is not None else DEFAULT_SEED,
+        )
+
+        # Stream tokens
+        snac_buffer = []
+        last_yield_position = 0
+        chunk_count = 0
+        total_tokens_seen = 0
+
+        async for output in self.model.generate_stream(prompt, sampling_params):
+            # Get latest generated tokens (cumulative list)
+            generated_token_ids = output.outputs[0].token_ids
+
+            # Process only NEW tokens since last iteration
+            new_tokens = generated_token_ids[total_tokens_seen:]
+            total_tokens_seen = len(generated_token_ids)
+
+            # Collect SNAC codes from new tokens
+            for token_id in new_tokens:
+                # Stop if we hit EOS
+                if token_id == CODE_END_TOKEN_ID:
+                    break
+
+                # Only collect valid SNAC tokens
+                if SNAC_MIN_ID <= token_id <= SNAC_MAX_ID:
+                    snac_buffer.append(token_id)
+
+            # Yield audio when we have enough tokens for a window
+            while len(snac_buffer) >= last_yield_position + self.WINDOW_SIZE:
+                # Get window of 28 tokens
+                window_start = last_yield_position
+                window_end = window_start + self.WINDOW_SIZE
+                window = snac_buffer[window_start:window_end]
+
+                if len(window) == self.WINDOW_SIZE:
+                    # Decode window to audio
+                    audio_bytes = await self.snac_decoder.decode_single_async(window)
+
+                    if audio_bytes:
+                        # Extract middle portion of audio
+                        audio_samples = len(audio_bytes) // 2
+                        middle_start_sample = (audio_samples - self.MIDDLE_SAMPLES) // 2
+                        middle_end_sample = middle_start_sample + self.MIDDLE_SAMPLES
+
+                        # Convert to byte positions
+                        middle_start_byte = middle_start_sample * 2
+                        middle_end_byte = middle_end_sample * 2
+
+                        # Extract middle chunk
+                        audio_chunk = audio_bytes[middle_start_byte:middle_end_byte]
+
+                        chunk_count += 1
+                        if chunk_count == 1:
+                            print(f"First chunk ready")
+
+                        yield audio_chunk
+
+                # Move forward by stride
+                last_yield_position += self.YIELD_STRIDE
+
+            # Check if generation is done
+            if CODE_END_TOKEN_ID in new_tokens:
+                break
+
+        # Final chunk: decode remaining tokens
+        remaining_tokens = len(snac_buffer) - last_yield_position
+        if remaining_tokens >= self.WINDOW_SIZE:
+            window = snac_buffer[-self.WINDOW_SIZE:]
+            audio_bytes = await self.snac_decoder.decode_single_async(window)
+            if audio_bytes:
+                yield audio_bytes[-self.MIDDLE_SAMPLES * 2:]
+
+        frames = len(snac_buffer) // 7
+        duration = frames / 6.86
+        print(f"Streamed {chunk_count} chunks (~{duration:.1f}s audio)")