# maya1-txt2speech / maya1 /constants.py
# Rajkumar Pramanik "RJproz
# initial commit
# 3c92819
# raw
# history blame
# 2.76 kB
"""
Maya1 Constants
Token IDs and special tokens used in the model.
Matches training configuration exactly.
"""
# Turn-delimiter and padding token IDs.
SOH_ID: int = 128259  # human turn begins
EOH_ID: int = 128260  # human turn ends
SOA_ID: int = 128261  # AI turn begins
EOA_ID: int = 128262  # AI turn ends; maya1 does not use this one
PAD_ID: int = 128263  # padding
# Text-side token IDs.
BOS_ID: int = 128000  # beginning-of-sequence marker (Llama BOS)
# End-of-text marker. It shows up inside the prefix, so it is not a stop token.
TEXT_EOT_ID: int = 128009
# Audio-side token IDs.
CODE_START_TOKEN_ID: int = 128257  # SOS: speech begins
CODE_END_TOKEN_ID: int = 128258  # EOS: speech ends; this is the audio stop token
CODE_TOKEN_OFFSET: int = 128266  # ID at which the SNAC codes start
# Range of SNAC code token IDs, derived from the offset so the values cannot drift.
SNAC_MIN_ID: int = CODE_TOKEN_OFFSET  # 128266
SNAC_MAX_ID: int = SNAC_MIN_ID + 7 * 4096 - 1  # 156937
# Generation stop tokens.
# CRITICAL: audio generation stops on CODE_END_TOKEN_ID (128258) only.
# TEXT_EOT_ID (128009) occurs in the prefix and must NOT end generation.
TRAINING_STOP_TOKEN_IDS: list = [CODE_END_TOKEN_ID]  # == [128258]
# Kept for reference only; built as a separate list object.
ALL_POSSIBLE_STOP_TOKENS: list = [TEXT_EOT_ID, *TRAINING_STOP_TOKEN_IDS]
# The 20 extended emotion tags; each one must be a single token.
ALL_EMOTION_TAGS = [
    "<angry>", "<appalled>", "<chuckle>", "<cry>", "<curious>",
    "<disappointed>", "<excited>", "<exhale>", "<gasp>", "<giggle>",
    "<gulp>", "<laugh>", "<laugh_harder>", "<mischievous>", "<sarcastic>",
    "<scream>", "<sigh>", "<sing>", "<snort>", "<whisper>",
]
# Model defaults.
DEFAULT_MODEL_PATH: str = "maya-research/maya1"
DEFAULT_CHECKPOINT: str = "checkpoint-25000"
DEFAULT_MAX_MODEL_LEN: int = 8192  # presumably a length in tokens - TODO confirm against the loader
# SNAC codec settings.
SNAC_MODEL_NAME: str = "hubertsiuzdak/snac_24khz"
SNAC_SAMPLE_RATE: int = 24000  # presumably Hz, in line with "24khz" in the model name
SNAC_TOKENS_PER_FRAME: int = 7
SNAC_LEVELS: int = 3
# Audio format settings.
AUDIO_SAMPLE_RATE: int = 24000  # same value as SNAC_SAMPLE_RATE
AUDIO_CHANNELS: int = 1
AUDIO_BITS_PER_SAMPLE: int = 16
# Default generation parameters.
DEFAULT_TEMPERATURE: float = 0.4  # kept low for more stable generation
DEFAULT_TOP_P: float = 0.9
DEFAULT_MAX_TOKENS: int = 2048  # a reasonable default for most use cases
DEFAULT_MIN_TOKENS: int = 4 * 7  # 28: at least 4 SNAC frames at 7 tokens per frame
DEFAULT_REPETITION_PENALTY: float = 1.1
DEFAULT_SEED = None  # None means random; set an integer for reproducibility
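# IMPORTANT: Emotion tags consume audio time, so budget max_tokens for them!
# <laugh> = ~4-6 seconds (~300-400 tokens)
# <excited>, <chuckle> = ~1-2 seconds (~50-150 tokens)
# Recommended max_tokens by use case:
# - Short phrases (< 10 words): 150-250 tokens (~3-5s)
# - Medium text (10-30 words): 250-500 tokens (~5-10s)
# - Long text (30+ words): 500-1500 tokens (~10-30s)
# - Very long text: 1500-2000 tokens (~30-42s)
# Note: 1 second ≈ 48 tokens (7 tokens/frame * 6.86 frames/sec)
# NOTE(review): the per-tag figures above do not match this rate. At ~48 tokens/s,
# 300-400 tokens would be ~6-8 s (not 4-6 s) and 150 tokens would be ~3 s (not 2 s).
# TODO: confirm the real tokens-per-second against the SNAC 24 kHz frame rate and
# reconcile whichever set of figures is wrong.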
# Streaming settings.
STREAM_BUFFER_SIZE: int = 4 * 7  # 28 tokens = 4 frames; process every 28 tokens
SNAC_BATCH_SIZE: int = 64
SNAC_BATCH_TIMEOUT_MS: int = 15  # milliseconds, going by the _MS suffix