"""
Quality Validator Module

Validates training data quality on multiple dimensions.
"""

import re
from collections import Counter
from typing import List, Dict, Any, Tuple


class QualityValidator:
    """
    Validate and score training data quality.

    An example is expected to be a dict with string-valued "instruction"
    and "output" fields. Validation produces a list of human-readable
    issues; scoring produces a heuristic quality score in the range 0-100.
    """

    # Instructions shorter than this many characters are flagged.
    _MIN_INSTRUCTION_LENGTH = 5

    # Splits text on runs of sentence-ending punctuation. Compiled once
    # because scoring runs for every example in a batch.
    _SENTENCE_BOUNDARY = re.compile(r"[.!?]+")

    def __init__(
        self,
        min_length: int = 10,
        max_length: int = 5000,
        min_score: float = 60.0
    ):
        """
        Initialize quality validator.

        Args:
            min_length: Minimum output text length, in characters
            max_length: Maximum output text length, in characters
            min_score: Minimum quality score threshold (0-100)
        """
        self.min_length = min_length
        self.max_length = max_length
        self.min_score = min_score

    def validate_example(self, example: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """
        Validate a single example.

        Malformed input (a non-dict example, or a non-string field value)
        is reported as a validation issue instead of raising, so one bad
        record cannot abort the processing of a whole batch.

        Args:
            example: Data example to validate

        Returns:
            Tuple of (is_valid, list_of_issues)
        """
        if not isinstance(example, dict):
            return False, ["Example must be a dict"]

        issues = []

        # Check required fields; nothing else can be checked without them.
        if "instruction" not in example:
            issues.append("Missing 'instruction' field")
        if "output" not in example:
            issues.append("Missing 'output' field")
        if issues:
            return False, issues

        instruction = example["instruction"]
        output = example["output"]

        # Check instruction
        if not isinstance(instruction, str):
            issues.append("'instruction' must be a string")
        elif len(instruction) < self._MIN_INSTRUCTION_LENGTH:
            issues.append("Instruction too short")

        # Check output
        if not isinstance(output, str):
            issues.append("'output' must be a string")
        else:
            if len(output) < self.min_length:
                issues.append(f"Output too short (min {self.min_length} chars)")
            if len(output) > self.max_length:
                issues.append(f"Output too long (max {self.max_length} chars)")
            # Whitespace-only output counts as empty.
            if not output.strip():
                issues.append("Empty output")

        return not issues, issues

    def score_example(self, example: Dict[str, Any]) -> float:
        """
        Score example quality (0-100).

        Scoring dimensions:
        - Length appropriateness
        - Completeness
        - Coherence (basic checks)

        Every validation issue costs 20 points. The length, repetition,
        sentence and gibberish heuristics below are applied on top of
        that, so an out-of-range output length is penalized both as a
        validation issue and by the length heuristic.

        Args:
            example: Data example

        Returns:
            Quality score (0-100)
        """
        score = 100.0

        # Check validity first
        is_valid, issues = self.validate_example(example)
        if not is_valid:
            score -= 20.0 * len(issues)
            if score <= 0:
                return 0.0

        # A missing or non-string output is scored as empty text rather
        # than crashing on len()/lower().
        output = example.get("output", "") if isinstance(example, dict) else ""
        if not isinstance(output, str):
            output = ""
        output_len = len(output)

        # Length scoring
        if output_len < self.min_length:
            score -= 20.0
        elif output_len > self.max_length:
            score -= 10.0

        # Coherence: repetition (share of distinct whitespace-split words)
        words = output.lower().split()
        if words:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < 0.3:  # Too repetitive
                score -= 30.0

        # Coherence: at least one sentence-like segment longer than 10 chars
        segments = self._SENTENCE_BOUNDARY.split(output)
        if not any(len(segment.strip()) > 10 for segment in segments):
            score -= 20.0

        # Coherence: gibberish (implausible average word length)
        if output_len > 20:
            avg_word_len = sum(len(word) for word in words) / max(len(words), 1)
            if avg_word_len > 15 or avg_word_len < 2:  # Likely gibberish
                score -= 25.0

        return max(0.0, min(100.0, score))

    def validate_batch(
        self,
        data: List[Dict[str, Any]],
        verbose: bool = False
    ) -> Dict[str, Any]:
        """
        Validate a batch of examples.

        An example is kept only if it passes validate_example() and its
        score is at least min_score. An example rejected purely because
        of its score gets an explicit issue entry, so every rejection has
        a recorded reason.

        Args:
            data: List of data examples
            verbose: Print detailed validation info

        Returns:
            Validation results dict with keys "total", "valid", "invalid",
            "pass_rate" (fraction 0-1), "avg_score", "valid_data" and
            "invalid_data" (dicts with "example", "issues" and "score")
        """
        valid_data = []
        invalid_data = []
        scores = []

        for i, example in enumerate(data):
            is_valid, issues = self.validate_example(example)
            score = self.score_example(example)
            scores.append(score)

            if is_valid and score >= self.min_score:
                valid_data.append(example)
                continue

            # Copy so the list returned by validate_example is not mutated.
            issues = list(issues)
            if is_valid:
                # Structurally fine, rejected on quality alone.
                issues.append(
                    f"Quality score below threshold (min {self.min_score})"
                )

            invalid_data.append({
                "example": example,
                "issues": issues,
                "score": score
            })

            if verbose:
                print(f"Example {i} failed validation (score: {score:.1f})")
                for issue in issues:
                    print(f" - {issue}")

        avg_score = sum(scores) / len(scores) if scores else 0.0

        results = {
            "total": len(data),
            "valid": len(valid_data),
            "invalid": len(invalid_data),
            "pass_rate": len(valid_data) / len(data) if data else 0.0,
            "avg_score": avg_score,
            "valid_data": valid_data,
            "invalid_data": invalid_data
        }

        if verbose:
            print("\n✅ Validation complete:")
            print(f" Total: {results['total']}")
            print(f" Valid: {results['valid']}")
            print(f" Invalid: {results['invalid']}")
            print(f" Pass rate: {results['pass_rate']*100:.1f}%")
            print(f" Avg score: {avg_score:.1f}")

        return results

    def filter_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Filter data, keeping only valid examples.

        Args:
            data: List of data examples

        Returns:
            Filtered valid data (the "valid_data" list from validate_batch)
        """
        return self.validate_batch(data)["valid_data"]

    def get_quality_report(self, data: List[Dict[str, Any]]) -> str:
        """
        Generate a quality report for the data.

        The report contains the batch summary followed, when any example
        was rejected, by the rejection reasons ordered from most to least
        frequent.

        Args:
            data: List of data examples

        Returns:
            Formatted quality report
        """
        results = self.validate_batch(data)

        parts = [
            "\n"
            "DATA QUALITY REPORT\n"
            "==================\n"
            f"Total Examples: {results['total']}\n"
            f"Valid Examples: {results['valid']}\n"
            f"Invalid Examples: {results['invalid']}\n"
            f"Pass Rate: {results['pass_rate']*100:.1f}%\n"
            f"Average Quality Score: {results['avg_score']:.1f}/100\n"
            "\n"
        ]

        if results['invalid_data']:
            parts.append("COMMON ISSUES:\n")
            issue_counts = Counter(
                issue
                for item in results['invalid_data']
                for issue in item['issues']
            )
            # most_common() sorts by count, descending; ties keep
            # first-seen order.
            for issue, count in issue_counts.most_common():
                parts.append(f" - {issue}: {count} examples\n")

        return "".join(parts)