Spaces:

rjproz
/

maya1-txt2speech

Running

Rajkumar Pramanik "RJproz

initial commit

3c92819 25 days ago

2.76 kB

	"""
	Maya1 Constants
	Token IDs and special tokens used in the model.
	Matches training configuration exactly.
	"""

	# Special control tokens
	SOH_ID = 128259 # Start of Human turn
	EOH_ID = 128260 # End of Human turn
	SOA_ID = 128261 # Start of AI turn
	EOA_ID = 128262 # End of AI turn (not used in maya1)
	PAD_ID = 128263 # Padding token

	# Text tokens
	BOS_ID = 128000 # Begin of sequence (Llama BOS)
	TEXT_EOT_ID = 128009 # End of text (appears in prefix, not a stop token!)

	# Audio tokens
	CODE_START_TOKEN_ID = 128257 # SOS - Start of Speech
	CODE_END_TOKEN_ID = 128258 # EOS - End of Speech (audio stop token)
	CODE_TOKEN_OFFSET = 128266 # Start of SNAC codes

	# SNAC token range
	SNAC_MIN_ID = 128266
	SNAC_MAX_ID = 156937 # 128266 + (7 * 4096) - 1

	# Stop tokens for generation
	# CRITICAL: Only use CODE_END_TOKEN_ID (128258) for audio generation
	# TEXT_EOT_ID (128009) appears in prefix and should NOT stop generation
	TRAINING_STOP_TOKEN_IDS = [CODE_END_TOKEN_ID] # [128258]
	ALL_POSSIBLE_STOP_TOKENS = [TEXT_EOT_ID, CODE_END_TOKEN_ID] # For reference only

	# 20 Extended Emotion Tags (must be single tokens)
	ALL_EMOTION_TAGS = [
	'<angry>',
	'<appalled>',
	'<chuckle>',
	'<cry>',
	'<curious>',
	'<disappointed>',
	'<excited>',
	'<exhale>',
	'<gasp>',
	'<giggle>',
	'<gulp>',
	'<laugh>',
	'<laugh_harder>',
	'<mischievous>',
	'<sarcastic>',
	'<scream>',
	'<sigh>',
	'<sing>',
	'<snort>',
	'<whisper>',
	]

	# Model configuration
	DEFAULT_MODEL_PATH = "maya-research/maya1"
	DEFAULT_CHECKPOINT = "checkpoint-25000"
	DEFAULT_MAX_MODEL_LEN = 8192

	# SNAC configuration
	SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
	SNAC_SAMPLE_RATE = 24000
	SNAC_TOKENS_PER_FRAME = 7
	SNAC_LEVELS = 3

	# Audio configuration
	AUDIO_SAMPLE_RATE = 24000
	AUDIO_CHANNELS = 1
	AUDIO_BITS_PER_SAMPLE = 16

	# Generation defaults
	DEFAULT_TEMPERATURE = 0.4 # Lower temp for more stable generation
	DEFAULT_TOP_P = 0.9
	DEFAULT_MAX_TOKENS = 2048 # Reasonable default for most use cases
	DEFAULT_MIN_TOKENS = 28 # At least 4 SNAC frames
	DEFAULT_REPETITION_PENALTY = 1.1
	DEFAULT_SEED = None # None = random, set integer for reproducibility

	# IMPORTANT: Emotion tags consume audio time!
	# <laugh> = ~4-6 seconds (~300-400 tokens)
	# <excited>, <chuckle> = ~1-2 seconds (~50-150 tokens)

	# Recommended max_tokens by use case:
	# - Short phrases (< 10 words): 150-250 tokens (~3-5s)
	# - Medium text (10-30 words): 250-500 tokens (~5-10s)
	# - Long text (30+ words): 500-1500 tokens (~10-30s)
	# - Very long text: 1500-2000 tokens (~30-42s)
	# Note: 1 second ≈ 48 tokens (7 tokens/frame * 6.86 frames/sec)

	# Streaming configuration
	STREAM_BUFFER_SIZE = 28 # 4 frames (process every 28 tokens)
	SNAC_BATCH_SIZE = 64
	SNAC_BATCH_TIMEOUT_MS = 15