Spaces:
Runtime error
Runtime error
| """ | |
| Data Preprocessors Module | |
| Provides text preprocessing and data cleaning utilities. | |
| """ | |
| import re | |
| from typing import List, Dict, Any | |
class TextPreprocessor:
    """Text preprocessing utilities.

    The helpers are stateless static methods, so they can be called either
    on the class itself or on an instance.
    """

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean and normalize text.

        Args:
            text: Input text

        Returns:
            Text with every run of whitespace collapsed to a single space
            and leading/trailing whitespace removed
        """
        # Collapse whitespace runs (spaces, tabs, newlines) into one space,
        # then trim both ends.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def remove_special_chars(text: str, keep_chars: str = " .,!?-") -> str:
        """
        Remove special characters.

        Every character that is not an ASCII letter, an ASCII digit, or one
        of ``keep_chars`` is deleted.

        Args:
            text: Input text
            keep_chars: Characters to keep in addition to ASCII letters
                and digits

        Returns:
            Text with special chars removed
        """
        # re.escape keeps characters such as "-", "]" or "^" literal inside
        # the character class instead of letting them form ranges/negations.
        pattern = f"[^a-zA-Z0-9{re.escape(keep_chars)}]"
        return re.sub(pattern, '', text)
class DataCleaner:
    """Data cleaning utilities.

    The helpers are stateless static methods, so they can be called either
    on the class itself or on an instance.
    """

    @staticmethod
    def remove_duplicates(data: List[Dict[str, Any]], key: str = "instruction") -> List[Dict[str, Any]]:
        """
        Remove duplicate examples.

        The first example seen for each distinct value of ``key`` is kept,
        in input order. Examples whose value for ``key`` is missing or
        falsy (e.g. an empty string) are dropped.

        Args:
            data: List of data examples
            key: Key to check for duplicates

        Returns:
            Deduplicated data (a new list; the input list is not modified)
        """
        seen = set()
        unique_data = []
        for example in data:
            value = example.get(key, "")
            # Skip missing/empty values as well as values already seen.
            if not value or value in seen:
                continue
            seen.add(value)
            unique_data.append(example)
        return unique_data

    @staticmethod
    def filter_by_length(
        data: List[Dict[str, Any]],
        min_length: int = 10,
        max_length: int = 10000,
        key: str = "output"
    ) -> List[Dict[str, Any]]:
        """
        Filter examples by length.

        An example is kept when ``min_length <= len(value) <= max_length``
        for its value under ``key``. A missing key or a ``None`` value is
        treated as empty text (length 0).

        Args:
            data: List of data examples
            min_length: Minimum text length (inclusive)
            max_length: Maximum text length (inclusive)
            key: Key to check length

        Returns:
            Filtered data (a new list; the input list is not modified)
        """
        def _length(example: Dict[str, Any]) -> int:
            # .get(key, "") alone would still return None when the key is
            # present with a None value, and len(None) raises TypeError.
            text = example.get(key)
            return 0 if text is None else len(text)

        return [
            example for example in data
            if min_length <= _length(example) <= max_length
        ]