Spaces:
Runtime error
Runtime error
File size: 2,323 Bytes
ec8f374 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
"""
Data Preprocessors Module
Provides text preprocessing and data cleaning utilities.
"""
import re
from typing import List, Dict, Any
class TextPreprocessor:
    """Stateless helpers for normalizing raw text."""

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Normalize the whitespace of a string.

        Args:
            text: Input text

        Returns:
            The text with every run of whitespace collapsed to a single
            space and with no leading or trailing whitespace.
        """
        # Collapse first, then trim: after collapsing, at most one space
        # can remain at either end.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    @staticmethod
    def remove_special_chars(text: str, keep_chars: str = " .,!?-") -> str:
        """
        Strip every character that is not an ASCII letter, a digit, or
        one of the explicitly allowed characters.

        Args:
            text: Input text
            keep_chars: Extra characters that must survive the filter

        Returns:
            The text with all other characters deleted.
        """
        # Escape the allowed characters so that ones like '-' or ']' are
        # taken literally inside the character class.
        allowed = re.escape(keep_chars)
        disallowed = re.compile(f"[^a-zA-Z0-9{allowed}]")
        return disallowed.sub('', text)
class DataCleaner:
    """Data cleaning utilities."""

    @staticmethod
    def remove_duplicates(data: List[Dict[str, Any]], key: str = "instruction") -> List[Dict[str, Any]]:
        """
        Remove duplicate examples.

        Only the first example for each distinct value of ``key`` is kept;
        input order is preserved. Examples whose ``key`` is missing or
        falsy (e.g. an empty string or ``None``) are dropped entirely.

        Args:
            data: List of data examples
            key: Key to check for duplicates

        Returns:
            Deduplicated data

        Raises:
            TypeError: If a truthy value stored under ``key`` is unhashable.
        """
        seen = set()
        unique_data = []
        for example in data:
            value = example.get(key, "")
            # Falsy values are skipped rather than deduplicated.
            if value and value not in seen:
                seen.add(value)
                unique_data.append(example)
        return unique_data

    @staticmethod
    def filter_by_length(
        data: List[Dict[str, Any]],
        min_length: int = 10,
        max_length: int = 10000,
        key: str = "output"
    ) -> List[Dict[str, Any]]:
        """
        Filter examples by length.

        An example is kept when the length of the value stored under
        ``key`` lies in the inclusive range ``[min_length, max_length]``.
        A missing key and an explicit ``None`` value are both treated as
        empty text (length 0).

        Args:
            data: List of data examples
            min_length: Minimum text length
            max_length: Maximum text length
            key: Key to check length

        Returns:
            Filtered data
        """
        def _length(example: Dict[str, Any]) -> int:
            # .get() returns None for a key that is present but null;
            # len(None) would raise and abort the whole filter.
            text = example.get(key)
            return 0 if text is None else len(text)

        return [
            example
            for example in data
            if min_length <= _length(example) <= max_length
        ]
|