# LaunchLLM/data_aggregation/quality_validator.py
"""
Quality Validator Module
Validates training data quality across multiple dimensions.
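
Typical usage (illustrative sketch; ``raw_examples`` stands in for a list of
dicts with ``instruction`` and ``output`` fields):

    validator = QualityValidator(min_length=10, max_length=5000, min_score=60.0)
    clean_examples = validator.filter_data(raw_examples)
    print(validator.get_quality_report(raw_examples))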
"""
import re
from typing import List, Dict, Any, Tuple
class QualityValidator:
"""Validate and score training data quality."""
def __init__(
self,
min_length: int = 10,
max_length: int = 5000,
min_score: float = 60.0
):
"""
Initialize quality validator.
Args:
            min_length: Minimum output length in characters
            max_length: Maximum output length in characters
min_score: Minimum quality score threshold (0-100)
"""
self.min_length = min_length
self.max_length = max_length
self.min_score = min_score
def validate_example(self, example: Dict[str, Any]) -> Tuple[bool, List[str]]:
"""
Validate a single example.
Args:
example: Data example to validate
Returns:
Tuple of (is_valid, list_of_issues)
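
        Example (illustrative; the dict literal below is made up for this doctest):

            >>> validator = QualityValidator()
            >>> ok, issues = validator.validate_example({"instruction": "Hi", "output": ""})
            >>> ok
            False
            >>> "Empty output" in issues
            True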
"""
issues = []
# Check required fields
if "instruction" not in example:
issues.append("Missing 'instruction' field")
if "output" not in example:
issues.append("Missing 'output' field")
if issues:
return False, issues
# Check lengths
instruction = example.get("instruction", "")
output = example.get("output", "")
if len(instruction) < 5:
issues.append("Instruction too short")
if len(output) < self.min_length:
issues.append(f"Output too short (min {self.min_length} chars)")
if len(output) > self.max_length:
issues.append(f"Output too long (max {self.max_length} chars)")
# Check for empty output
if not output.strip():
issues.append("Empty output")
return len(issues) == 0, issues
def score_example(self, example: Dict[str, Any]) -> float:
"""
Score example quality (0-100).
        Scoring dimensions:
        - Required-field validity
        - Length appropriateness
        - Repetition and sentence-structure heuristics
        - Basic gibberish detection (word-length distribution)
Args:
example: Data example
Returns:
Quality score (0-100)
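
        Example (illustrative inputs; only the relative ordering of the two
        scores is asserted, since exact values depend on the heuristics above):

            >>> validator = QualityValidator()
            >>> repetitive = {"instruction": "Describe the sky.", "output": "blue blue blue blue blue blue blue blue"}
            >>> coherent = {"instruction": "Describe the sky.", "output": "The sky looks blue because sunlight scatters in the atmosphere."}
            >>> validator.score_example(repetitive) < validator.score_example(coherent)
            True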
"""
score = 100.0
# Check validity first
is_valid, issues = self.validate_example(example)
if not is_valid:
score -= 20.0 * len(issues)
if score <= 0:
return 0.0
# Length scoring
output = example.get("output", "")
output_len = len(output)
if output_len < self.min_length:
score -= 20.0
elif output_len > self.max_length:
score -= 10.0
# Coherence checks
# Check for repetition
words = output.lower().split()
if len(words) > 0:
unique_ratio = len(set(words)) / len(words)
if unique_ratio < 0.3: # Too repetitive
score -= 30.0
# Check for proper sentences (basic)
sentences = re.split(r'[.!?]+', output)
valid_sentences = [s for s in sentences if len(s.strip()) > 10]
if len(valid_sentences) == 0:
score -= 20.0
# Check for gibberish (basic)
if output_len > 20:
# Check if output has reasonable word length distribution
avg_word_len = sum(len(w) for w in words) / max(len(words), 1)
if avg_word_len > 15 or avg_word_len < 2: # Likely gibberish
score -= 25.0
return max(0.0, min(100.0, score))
def validate_batch(
self,
data: List[Dict[str, Any]],
verbose: bool = False
) -> Dict[str, Any]:
"""
Validate a batch of examples.
Args:
data: List of data examples
verbose: Print detailed validation info
Returns:
Validation results dict
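
        Example (illustrative single-example batch; shows the keys of the
        returned results dict):

            >>> example = {"instruction": "Say hello politely.", "output": "Hello! It is very nice to meet you today."}
            >>> results = QualityValidator().validate_batch([example])
            >>> sorted(results)
            ['avg_score', 'invalid', 'invalid_data', 'pass_rate', 'total', 'valid', 'valid_data']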
"""
valid_data = []
invalid_data = []
scores = []
for i, example in enumerate(data):
is_valid, issues = self.validate_example(example)
score = self.score_example(example)
scores.append(score)
if is_valid and score >= self.min_score:
valid_data.append(example)
else:
invalid_data.append({
"example": example,
"issues": issues,
"score": score
})
if verbose:
print(f"Example {i} failed validation (score: {score:.1f})")
for issue in issues:
print(f" - {issue}")
avg_score = sum(scores) / len(scores) if scores else 0.0
results = {
"total": len(data),
"valid": len(valid_data),
"invalid": len(invalid_data),
"pass_rate": len(valid_data) / len(data) if data else 0.0,
"avg_score": avg_score,
"valid_data": valid_data,
"invalid_data": invalid_data
}
if verbose:
print(f"\n✅ Validation complete:")
print(f" Total: {results['total']}")
print(f" Valid: {results['valid']}")
print(f" Invalid: {results['invalid']}")
print(f" Pass rate: {results['pass_rate']*100:.1f}%")
print(f" Avg score: {avg_score:.1f}")
return results
def filter_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Filter data, keeping only valid examples.
Args:
data: List of data examples
Returns:
Filtered valid data
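
        Example (illustrative; a clearly malformed example is dropped):

            >>> QualityValidator().filter_data([{"instruction": "Hi", "output": ""}])
            []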
"""
results = self.validate_batch(data)
return results["valid_data"]
def get_quality_report(self, data: List[Dict[str, Any]]) -> str:
"""
Generate a quality report for the data.
Args:
data: List of data examples
Returns:
Formatted quality report
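
        Example (illustrative; only checks that the report header is present):

            >>> example = {"instruction": "Name a color.", "output": "A commonly named color is blue."}
            >>> report = QualityValidator().get_quality_report([example])
            >>> "DATA QUALITY REPORT" in report
            True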
"""
results = self.validate_batch(data)
report = f"""
DATA QUALITY REPORT
==================
Total Examples: {results['total']}
Valid Examples: {results['valid']}
Invalid Examples: {results['invalid']}
Pass Rate: {results['pass_rate']*100:.1f}%
Average Quality Score: {results['avg_score']:.1f}/100
"""
if results['invalid_data']:
report += "COMMON ISSUES:\n"
issue_counts = {}
for item in results['invalid_data']:
for issue in item['issues']:
issue_counts[issue] = issue_counts.get(issue, 0) + 1
for issue, count in sorted(issue_counts.items(), key=lambda x: x[1], reverse=True):
report += f" - {issue}: {count} examples\n"
return report
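

if __name__ == "__main__":
    # Minimal demo sketch (illustrative only): the two examples below are made up
    # to show one passing and one failing case against the default thresholds.
    demo_data = [
        {
            "instruction": "Explain why training data quality matters.",
            "output": (
                "High-quality training data improves model behavior. "
                "Clean, complete examples reduce noise during fine-tuning "
                "and make evaluation results easier to trust."
            ),
        },
        {"instruction": "Hi", "output": "ok"},
    ]

    validator = QualityValidator()
    validator.validate_batch(demo_data, verbose=True)
    print(validator.get_quality_report(demo_data))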