""" Data Preprocessors Module Provides text preprocessing and data cleaning utilities. """ import re from typing import List, Dict, Any class TextPreprocessor: """Text preprocessing utilities.""" @staticmethod def clean_text(text: str) -> str: """ Clean and normalize text. Args: text: Input text Returns: Cleaned text """ # Remove extra whitespace text = re.sub(r'\s+', ' ', text) # Strip leading/trailing whitespace text = text.strip() return text @staticmethod def remove_special_chars(text: str, keep_chars: str = " .,!?-") -> str: """ Remove special characters. Args: text: Input text keep_chars: Characters to keep Returns: Text with special chars removed """ pattern = f"[^a-zA-Z0-9{re.escape(keep_chars)}]" return re.sub(pattern, '', text) class DataCleaner: """Data cleaning utilities.""" @staticmethod def remove_duplicates(data: List[Dict[str, Any]], key: str = "instruction") -> List[Dict[str, Any]]: """ Remove duplicate examples. Args: data: List of data examples key: Key to check for duplicates Returns: Deduplicated data """ seen = set() unique_data = [] for example in data: value = example.get(key, "") if value and value not in seen: seen.add(value) unique_data.append(example) return unique_data @staticmethod def filter_by_length( data: List[Dict[str, Any]], min_length: int = 10, max_length: int = 10000, key: str = "output" ) -> List[Dict[str, Any]]: """ Filter examples by length. Args: data: List of data examples min_length: Minimum text length max_length: Maximum text length key: Key to check length Returns: Filtered data """ filtered = [] for example in data: text = example.get(key, "") if min_length <= len(text) <= max_length: filtered.append(example) return filtered