"""
Quality Validator Module
Validates training data quality on multiple dimensions.
"""
import re
from typing import List, Dict, Any, Tuple


class QualityValidator:
    """Validate and score training data quality."""

    def __init__(
        self,
        min_length: int = 10,
        max_length: int = 5000,
        min_score: float = 60.0
    ):
        """
        Initialize the quality validator.

        Args:
            min_length: Minimum output length in characters
            max_length: Maximum output length in characters
            min_score: Minimum quality score threshold (0-100)
        """
        self.min_length = min_length
        self.max_length = max_length
        self.min_score = min_score

    def validate_example(self, example: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """
        Validate a single example.

        Args:
            example: Data example to validate

        Returns:
            Tuple of (is_valid, list_of_issues)
        """
        issues = []

        # Check required fields; skip the remaining checks if any are missing
        if "instruction" not in example:
            issues.append("Missing 'instruction' field")
        if "output" not in example:
            issues.append("Missing 'output' field")
        if issues:
            return False, issues

        # Check lengths
        instruction = example.get("instruction", "")
        output = example.get("output", "")
        if len(instruction) < 5:
            issues.append("Instruction too short")
        if len(output) < self.min_length:
            issues.append(f"Output too short (min {self.min_length} chars)")
        if len(output) > self.max_length:
            issues.append(f"Output too long (max {self.max_length} chars)")

        # Check for empty or whitespace-only output
        if not output.strip():
            issues.append("Empty output")

        return len(issues) == 0, issues
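
    # Illustrative behavior (hypothetical input, not from the original file):
    # QualityValidator().validate_example(
    #     {"instruction": "Explain gravity.", "output": "x"}
    # ) returns (False, ["Output too short (min 10 chars)"]).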

    def score_example(self, example: Dict[str, Any]) -> float:
        """
        Score example quality (0-100).

        Scoring dimensions:
        - Length appropriateness
        - Completeness
        - Coherence (basic checks)

        Args:
            example: Data example

        Returns:
            Quality score (0-100)
        """
        score = 100.0

        # Check validity first; each validation issue costs 20 points
        is_valid, issues = self.validate_example(example)
        if not is_valid:
            score -= 20.0 * len(issues)
            if score <= 0:
                return 0.0

        # Length scoring
        output = example.get("output", "")
        output_len = len(output)
        if output_len < self.min_length:
            score -= 20.0
        elif output_len > self.max_length:
            score -= 10.0

        # Coherence check: penalize heavy repetition
        words = output.lower().split()
        if len(words) > 0:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < 0.3:  # Too repetitive
                score -= 30.0

        # Check for proper sentences (basic)
        sentences = re.split(r'[.!?]+', output)
        valid_sentences = [s for s in sentences if len(s.strip()) > 10]
        if len(valid_sentences) == 0:
            score -= 20.0

        # Check for gibberish (basic): extreme average word lengths
        if output_len > 20:
            avg_word_len = sum(len(w) for w in words) / max(len(words), 1)
            if avg_word_len > 15 or avg_word_len < 2:  # Likely gibberish
                score -= 25.0

        return max(0.0, min(100.0, score))
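
    # Illustrative scoring (hypothetical input): an output like "ok. ok. ok. ok."
    # takes the 30-point repetition penalty (unique_ratio below 0.3) plus the
    # 20-point no-valid-sentence penalty, scoring 50 and falling under the
    # default min_score of 60.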

    def validate_batch(
        self,
        data: List[Dict[str, Any]],
        verbose: bool = False
    ) -> Dict[str, Any]:
        """
        Validate a batch of examples.

        Args:
            data: List of data examples
            verbose: Print detailed validation info

        Returns:
            Validation results dict
        """
        valid_data = []
        invalid_data = []
        scores = []

        for i, example in enumerate(data):
            is_valid, issues = self.validate_example(example)
            score = self.score_example(example)
            scores.append(score)

            if is_valid and score >= self.min_score:
                valid_data.append(example)
            else:
                invalid_data.append({
                    "example": example,
                    "issues": issues,
                    "score": score
                })
                if verbose:
                    print(f"Example {i} failed validation (score: {score:.1f})")
                    for issue in issues:
                        print(f"  - {issue}")

        avg_score = sum(scores) / len(scores) if scores else 0.0
        results = {
            "total": len(data),
            "valid": len(valid_data),
            "invalid": len(invalid_data),
            "pass_rate": len(valid_data) / len(data) if data else 0.0,
            "avg_score": avg_score,
            "valid_data": valid_data,
            "invalid_data": invalid_data
        }

        if verbose:
            print("\n✅ Validation complete:")
            print(f"  Total: {results['total']}")
            print(f"  Valid: {results['valid']}")
            print(f"  Invalid: {results['invalid']}")
            print(f"  Pass rate: {results['pass_rate']*100:.1f}%")
            print(f"  Avg score: {avg_score:.1f}")

        return results
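
    # Note: "pass_rate" is stored as a fraction in [0, 1]; callers that want a
    # percentage (as the verbose output above prints) should multiply by 100.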

    def filter_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Filter data, keeping only valid examples.

        Args:
            data: List of data examples

        Returns:
            Filtered valid data
        """
        results = self.validate_batch(data)
        return results["valid_data"]
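
    # Typical one-liner (hypothetical variable names):
    # clean_data = QualityValidator().filter_data(raw_data)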

    def get_quality_report(self, data: List[Dict[str, Any]]) -> str:
        """
        Generate a quality report for the data.

        Args:
            data: List of data examples

        Returns:
            Formatted quality report
        """
        results = self.validate_batch(data)
        report = f"""
DATA QUALITY REPORT
===================
Total Examples: {results['total']}
Valid Examples: {results['valid']}
Invalid Examples: {results['invalid']}
Pass Rate: {results['pass_rate']*100:.1f}%
Average Quality Score: {results['avg_score']:.1f}/100
"""
        if results['invalid_data']:
            report += "COMMON ISSUES:\n"
            issue_counts = {}
            for item in results['invalid_data']:
                for issue in item['issues']:
                    issue_counts[issue] = issue_counts.get(issue, 0) + 1
            for issue, count in sorted(issue_counts.items(), key=lambda x: x[1], reverse=True):
                report += f"  - {issue}: {count} examples\n"

        return report
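

# Minimal usage sketch (not part of the original module; example data is
# invented for illustration). Running the file prints a quality report for
# three hand-made examples: one good, one repetitive, one missing a field.
if __name__ == "__main__":
    validator = QualityValidator(min_length=10, max_length=5000, min_score=60.0)

    sample_data = [
        {
            "instruction": "Explain what overfitting means in machine learning.",
            "output": (
                "Overfitting happens when a model memorizes its training data, "
                "including the noise, and therefore generalizes poorly to new inputs."
            ),
        },
        # Repetitive, no real sentences: passes validation but scores below 60.
        {"instruction": "Say something.", "output": "ok. ok. ok. ok."},
        # Invalid: missing the required 'output' field.
        {"instruction": "Summarize this article."},
    ]

    print(validator.get_quality_report(sample_data))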