Spaces:
Sleeping
Sleeping
| """Prompt engineering for consistent voice generation.""" | |
| import re | |
| import random | |
| from typing import List, Dict, Optional | |
| import logging | |
| # Set up logging | |
| logger = logging.getLogger(__name__) | |
# Descriptor table used when building voice prompts.
# Each key is a voice name; each value holds three fields:
#   "adjectives"      - list of words describing the voice
#   "characteristics" - list of pitch / pacing / tone / timbre descriptions
#   "speaking_style"  - a single short phrase summarising the delivery
VOICE_STYLES = {
    "alloy": {
        "adjectives": ["balanced", "natural", "clear", "articulate", "neutral", "conversational"],
        "characteristics": ["medium pitch", "even pacing", "neutral tone", "balanced resonance"],
        "speaking_style": "conversational and balanced",
    },
    "echo": {
        "adjectives": ["resonant", "deep", "reverberant", "rich", "sonorous", "full"],
        "characteristics": ["lower pitch", "deliberate pacing", "resonant tone", "deeper timbre"],
        "speaking_style": "rich and resonant",
    },
    "fable": {
        "adjectives": ["bright", "light", "clear", "energetic", "articulate", "animated"],
        "characteristics": ["higher pitch", "lively pacing", "bright tone", "clear articulation"],
        "speaking_style": "bright and energetic",
    },
    "onyx": {
        "adjectives": ["deep", "authoritative", "powerful", "commanding", "strong", "resolute"],
        "characteristics": ["low pitch", "measured pacing", "authoritative tone", "strong projection"],
        "speaking_style": "deep and authoritative",
    },
    "nova": {
        "adjectives": ["warm", "pleasant", "smooth", "harmonious", "gentle", "comforting"],
        "characteristics": ["medium pitch", "smooth pacing", "warm tone", "pleasant timbre"],
        "speaking_style": "warm and smooth",
    },
    "shimmer": {
        "adjectives": ["light", "airy", "bright", "crystalline", "delicate", "expressive"],
        "characteristics": ["higher pitch", "quick pacing", "light tone", "bright timbre"],
        "speaking_style": "light and expressive",
    },
    "custom": {
        "adjectives": ["clear", "distinct", "authentic", "natural", "personalized", "unique"],
        "characteristics": ["natural rhythm", "authentic tone", "personal inflection", "distinctive sound"],
        "speaking_style": "authentic and natural",
    },
}
def initialize_templates():
    """Announce that prompt templates are ready and return the style table.

    No actual setup happens here yet; the function is a hook for future
    initialization work.

    Returns:
        The module-level ``VOICE_STYLES`` dictionary itself (not a copy).
    """
    templates = VOICE_STYLES
    logger.info("Prompt templates initialized")
    return templates
def split_into_segments(text: str, max_chars: int = 150) -> List[str]:
    """Split text into segments of at most ``max_chars`` characters.

    The text is broken at the coarsest boundary that fits: first at sentence
    ends (``.``, ``!``, ``?``), then, for a sentence that is itself too long,
    at phrase punctuation (``,``, ``;``, ``:``), and finally at whitespace
    between words. Adjacent pieces are joined with a single space for as long
    as the joined result still fits.

    Args:
        text: Text to split.
        max_chars: Maximum characters per segment, including joining spaces.

    Returns:
        The segments in their original reading order, stripped of surrounding
        whitespace and never empty. Empty input, or input no longer than
        ``max_chars``, is returned unchanged as a one-element list. A single
        word longer than ``max_chars`` is kept whole as its own segment
        rather than being cut mid-word.
    """
    # Nothing to split: hand the input back untouched.
    if not text or len(text) <= max_chars:
        return [text]

    segments: List[str] = []
    current = ""

    def flush() -> None:
        """Emit the segment being built, if any."""
        nonlocal current
        if current:
            segments.append(current)
            current = ""

    def add(piece: str) -> None:
        """Append ``piece`` to the current segment, starting a new one if it no longer fits."""
        nonlocal current
        piece = piece.strip()
        if not piece:
            return
        if not current:
            current = piece
        elif len(current) + 1 + len(piece) <= max_chars:
            # The +1 accounts for the joining space so the limit is honoured.
            current = f"{current} {piece}"
        else:
            flush()
            current = piece

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if len(sentence) <= max_chars:
            add(sentence)
            continue

        # The sentence alone is too long: close the running segment so the
        # sentence starts on a fresh one, then fall back to phrases.
        flush()
        for phrase in re.split(r'(?<=[,;:])\s+', sentence):
            if len(phrase) <= max_chars:
                add(phrase)
                continue

            # Even the phrase is too long. Flush first so that earlier short
            # phrases are emitted before these word chunks (keeps the order).
            flush()
            for word in phrase.split():
                add(word)
            flush()

    # Add the last segment
    flush()

    logger.info(f"Split text into {len(segments)} segments")
    return segments
def format_text_for_voice(text: str, voice_name: str, segment_index: int = 0, total_segments: int = 1) -> str:
    """Return the text to send to the generator for a given voice.

    This is currently a pass-through: the text is returned unchanged and the
    other arguments are accepted but not used.

    Args:
        text: Text to format.
        voice_name: Name of the voice (unused).
        segment_index: Index of this segment in a multi-segment text (unused).
        total_segments: Total number of segments (unused).

    Returns:
        ``text`` exactly as it was passed in.
    """
    # IMPORTANT: bracketed voice instructions are deliberately not added any
    # more, because CSM reads them aloud. Voice identity is controlled through
    # speaker IDs instead, and the Generator class takes care of the proper
    # formatting, so the text goes through as-is.
    formatted = text
    return formatted