Spaces:

MothersTongue
/

voice-matcher-api

Running on T4

voice-matcher-api / app /string_processor.py

fe79a8f over 1 year ago

686 Bytes

	import unicodedata
	import re


	def clean_transcription(text):
	    """Reduce a transcription to unaccented word characters and whitespace.

	    The text goes through four passes, in this order:

	    1. Unicode NFKD decomposition, so accented letters split into a base
	       character followed by combining marks.
	    2. Every combining mark is dropped, leaving only the base characters.
	    3. Every occurrence of the ʻ character is deleted (not only a leading
	       one); this is done explicitly before the regex pass.
	    4. Any character that is neither a regex word character (letters,
	       digits, underscore) nor whitespace is deleted, and the result is
	       stripped of leading and trailing whitespace.

	    Internal whitespace is left exactly as it is after the deletions, so
	    removing punctuation between two spaces leaves both spaces in place.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string.
	    """
	    decomposed = unicodedata.normalize('NFKD', text)

	    # A combining class of 0 marks a base (non-combining) character.
	    without_marks = ''.join(
	        ch for ch in decomposed if unicodedata.combining(ch) == 0
	    )

	    without_okina = without_marks.replace('ʻ', '')

	    return re.sub(r'[^\w\s]', '', without_okina).strip()