Spaces:

jameszokah
/

jamiya

Sleeping

App Files Files Community

jamiya / app /text_normalizer.py

jameszokah

init commit

383520d 8 months ago

raw

history blame

5.57 kB

	"""
	Text normalization and cleaning utilities for CSM-1B TTS system.
	Handles common issues like contractions, numbers, and special characters.
	"""
	import re
	import logging

	logger = logging.getLogger(__name__)

	class TextNormalizer:
	"""Text normalization utilities for TTS."""

	# Common English contractions mapping
	CONTRACTIONS = {
	"don't": "dont",
	"won't": "wont",
	"can't": "cant",
	"isn't": "isnt",
	"he's": "hes",
	"she's": "shes",
	"they're": "theyre",
	"we're": "were",
	"you're": "youre",
	"that's": "thats",
	"it's": "its",
	"what's": "whats",
	"let's": "lets",
	"who's": "whos",
	"how's": "hows",
	"where's": "wheres",
	"there's": "theres",
	"wouldn't": "wouldnt",
	"shouldn't": "shouldnt",
	"couldn't": "couldnt",
	"hasn't": "hasnt",
	"haven't": "havent",
	"hadn't": "hadnt",
	"didn't": "didnt",
	"i'm": "im",
	"i've": "ive",
	"i'd": "id",
	"i'll": "ill",
	"you've": "youve",
	"you'll": "youll",
	"you'd": "youd",
	"we've": "weve",
	"we'll": "well",
	"we'd": "wed",
	"they've": "theyve",
	"they'll": "theyll",
	"they'd": "theyd",
	"aren't": "arent",
	"weren't": "werent",
	"wasn't": "wasnt",
	}

	# Common abbreviations to expand
	ABBREVIATIONS = {
	"Mr.": "Mister",
	"Mrs.": "Misses",
	"Dr.": "Doctor",
	"Prof.": "Professor",
	"St.": "Street",
	"Rd.": "Road",
	"Ave.": "Avenue",
	"vs.": "versus",
	"etc.": "etcetera",
	"e.g.": "for example",
	"i.e.": "that is",
	"approx.": "approximately",
	}

	# Simple number words for common numbers
	NUMBER_WORDS = {
	"0": "zero",
	"1": "one",
	"2": "two",
	"3": "three",
	"4": "four",
	"5": "five",
	"6": "six",
	"7": "seven",
	"8": "eight",
	"9": "nine",
	"10": "ten",
	"11": "eleven",
	"12": "twelve",
	"13": "thirteen",
	"14": "fourteen",
	"15": "fifteen",
	"16": "sixteen",
	"17": "seventeen",
	"18": "eighteen",
	"19": "nineteen",
	"20": "twenty",
	"30": "thirty",
	"40": "forty",
	"50": "fifty",
	"60": "sixty",
	"70": "seventy",
	"80": "eighty",
	"90": "ninety",
	"100": "one hundred",
	"1000": "one thousand",
	"1000000": "one million",
	"1000000000": "one billion",
	}

	@classmethod
	def normalize_text(cls, text: str) -> str:
	"""
	Normalize text for TTS: handle contractions, punctuation, and special cases.

	Args:
	text: Input text to normalize

	Returns:
	Normalized text ready for TTS
	"""
	if not text:
	return text

	# Log original text for debugging
	logger.debug(f"Normalizing text: '{text}'")

	# Remove voice instructions in square brackets
	text = re.sub(r'\[.*?\]', '', text)

	# Handle contractions - preserving case sensitivity
	for contraction, replacement in cls.CONTRACTIONS.items():
	# Case insensitive replacement
	text = re.sub(r'\b' + re.escape(contraction) + r'\b', replacement, text, flags=re.IGNORECASE)

	# Expand common abbreviations
	for abbr, expanded in cls.ABBREVIATIONS.items():
	text = text.replace(abbr, expanded)

	# Handle numbers - only convert standalone numbers
	def replace_number(match):
	number = match.group(0)
	if number in cls.NUMBER_WORDS:
	return cls.NUMBER_WORDS[number]
	return number

	text = re.sub(r'\b\d+\b', replace_number, text)

	# Replace problematic symbols
	text = text.replace("&", " and ")
	text = text.replace("%", " percent ")
	text = text.replace("@", " at ")
	text = text.replace("#", " number ")
	text = text.replace("$", " dollar ")
	text = text.replace("€", " euro ")
	text = text.replace("£", " pound ")
	text = text.replace("¥", " yen ")

	# Handle dates in MM/DD/YYYY format
	text = re.sub(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', r'\1 \2 \3', text)

	# Fix excessive spaces
	text = re.sub(r'\s+', ' ', text).strip()

	# Ensure sentence ends with punctuation
	if not text[-1] in ['.', '!', '?', ';', ':', ',']:
	text = text + '.'

	logger.debug(f"Normalized text: '{text}'")
	return text

	@classmethod
	def split_into_sentences(cls, text: str) -> list:
	"""
	Split text into sentences for better TTS performance.

	Args:
	text: Input text to split

	Returns:
	List of sentences
	"""
	# Normalize first
	text = cls.normalize_text(text)

	# Split on sentence boundaries
	sentences = re.split(r'(?<=[.!?])\s+', text)

	# Remove empty sentences
	sentences = [s for s in sentences if s.strip()]

	return sentences

	def clean_text_for_tts(text: str) -> str:
	"""Clean and normalize text for TTS processing."""
	return TextNormalizer.normalize_text(text)