# Spaces: Running on T4
import re
import unicodedata
def clean_transcription(text: str) -> str:
    """Return *text* with diacritics, the ʻ character and punctuation removed.

    The cleaning steps run in this order:

    1. Normalize to Unicode NFKD so accented letters split into a base
       letter plus combining marks (compatibility forms are expanded too).
    2. Drop every combining mark, leaving only the base characters.
    3. Remove every occurrence of ``ʻ`` (not just a leading one).
    4. Remove every character that is neither a word character nor
       whitespace. Underscores count as word characters and are kept.
    5. Strip leading and trailing whitespace. Interior whitespace is left
       untouched.

    Args:
        text: The transcription to clean.

    Returns:
        The cleaned string; an empty string if nothing survives.
    """
    decomposed = unicodedata.normalize('NFKD', text)

    # After NFKD the accents are separate combining code points, so
    # filtering them out is what actually strips the diacritics.
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )

    # Removed explicitly and before the regex pass: the regex below only
    # deletes non-word characters, so it cannot be relied on to drop this
    # letter-like mark.
    without_okina = without_marks.replace('ʻ', '')

    # Anything that is not a word character (\w) or whitespace (\s) goes.
    letters_and_spaces = re.sub(r'[^\w\s]', '', without_okina)

    return letters_and_spaces.strip()