| """ | |
| Quality Validator Module | |
| Validates training data quality on multiple dimensions. | |
| """ | |
| import re | |
| from typing import List, Dict, Any, Tuple | |


class QualityValidator:
    """Validate and score training data quality."""

    def __init__(
        self,
        min_length: int = 10,
        max_length: int = 5000,
        min_score: float = 60.0
    ):
        """
        Initialize quality validator.

        Args:
            min_length: Minimum text length
            max_length: Maximum text length
            min_score: Minimum quality score threshold (0-100)
        """
        self.min_length = min_length
        self.max_length = max_length
        self.min_score = min_score

    def validate_example(self, example: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """
        Validate a single example.

        Args:
            example: Data example to validate

        Returns:
            Tuple of (is_valid, list_of_issues)
        """
        issues = []

        # Check required fields
        if "instruction" not in example:
            issues.append("Missing 'instruction' field")
        if "output" not in example:
            issues.append("Missing 'output' field")
        if issues:
            return False, issues

        # Check lengths
        instruction = example.get("instruction", "")
        output = example.get("output", "")

        if len(instruction) < 5:
            issues.append("Instruction too short")
        if len(output) < self.min_length:
            issues.append(f"Output too short (min {self.min_length} chars)")
        if len(output) > self.max_length:
            issues.append(f"Output too long (max {self.max_length} chars)")

        # Check for empty output
        if not output.strip():
            issues.append("Empty output")

        return len(issues) == 0, issues

    def score_example(self, example: Dict[str, Any]) -> float:
        """
        Score example quality (0-100).

        Scoring dimensions:
        - Length appropriateness
        - Completeness
        - Coherence (basic checks)

        Args:
            example: Data example

        Returns:
            Quality score (0-100)
        """
        score = 100.0

        # Check validity first
        is_valid, issues = self.validate_example(example)
        if not is_valid:
            score -= 20.0 * len(issues)
            if score <= 0:
                return 0.0

        # Length scoring
        output = example.get("output", "")
        output_len = len(output)

        if output_len < self.min_length:
            score -= 20.0
        elif output_len > self.max_length:
            score -= 10.0

        # Coherence checks
        # Check for repetition
        words = output.lower().split()
        if len(words) > 0:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < 0.3:  # Too repetitive
                score -= 30.0

        # Check for proper sentences (basic)
        sentences = re.split(r'[.!?]+', output)
        valid_sentences = [s for s in sentences if len(s.strip()) > 10]
        if len(valid_sentences) == 0:
            score -= 20.0

        # Check for gibberish (basic)
        if output_len > 20:
            # Check if output has a reasonable word length distribution
            avg_word_len = sum(len(w) for w in words) / max(len(words), 1)
            if avg_word_len > 15 or avg_word_len < 2:  # Likely gibberish
                score -= 25.0

        return max(0.0, min(100.0, score))

    def validate_batch(
        self,
        data: List[Dict[str, Any]],
        verbose: bool = False
    ) -> Dict[str, Any]:
        """
        Validate a batch of examples.

        Args:
            data: List of data examples
            verbose: Print detailed validation info

        Returns:
            Validation results dict
        """
        valid_data = []
        invalid_data = []
        scores = []

        for i, example in enumerate(data):
            is_valid, issues = self.validate_example(example)
            score = self.score_example(example)
            scores.append(score)

            if is_valid and score >= self.min_score:
                valid_data.append(example)
            else:
                invalid_data.append({
                    "example": example,
                    "issues": issues,
                    "score": score
                })
                if verbose:
                    print(f"Example {i} failed validation (score: {score:.1f})")
                    for issue in issues:
                        print(f"  - {issue}")

        avg_score = sum(scores) / len(scores) if scores else 0.0

        results = {
            "total": len(data),
            "valid": len(valid_data),
            "invalid": len(invalid_data),
            "pass_rate": len(valid_data) / len(data) if data else 0.0,
            "avg_score": avg_score,
            "valid_data": valid_data,
            "invalid_data": invalid_data
        }

        if verbose:
            print("\n✅ Validation complete:")
            print(f"  Total: {results['total']}")
            print(f"  Valid: {results['valid']}")
            print(f"  Invalid: {results['invalid']}")
            print(f"  Pass rate: {results['pass_rate']*100:.1f}%")
            print(f"  Avg score: {avg_score:.1f}")

        return results

    def filter_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Filter data, keeping only valid examples.

        Args:
            data: List of data examples

        Returns:
            Filtered valid data
        """
        results = self.validate_batch(data)
        return results["valid_data"]

    def get_quality_report(self, data: List[Dict[str, Any]]) -> str:
        """
        Generate a quality report for the data.

        Args:
            data: List of data examples

        Returns:
            Formatted quality report
        """
        results = self.validate_batch(data)

        report = f"""
DATA QUALITY REPORT
===================
Total Examples: {results['total']}
Valid Examples: {results['valid']}
Invalid Examples: {results['invalid']}
Pass Rate: {results['pass_rate']*100:.1f}%
Average Quality Score: {results['avg_score']:.1f}/100
"""

        if results['invalid_data']:
            report += "COMMON ISSUES:\n"
            issue_counts = {}
            for item in results['invalid_data']:
                for issue in item['issues']:
                    issue_counts[issue] = issue_counts.get(issue, 0) + 1

            for issue, count in sorted(issue_counts.items(), key=lambda x: x[1], reverse=True):
                report += f"  - {issue}: {count} examples\n"

        return report
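

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the validator itself: the sample
    # records below are hypothetical and exist only to show how validate_batch()
    # and get_quality_report() are meant to be called.
    sample_data = [
        {
            "instruction": "Explain what a quality score represents.",
            "output": (
                "A quality score summarizes how complete, coherent, and "
                "appropriately sized a training example is, on a 0-100 scale."
            ),
        },
        {"instruction": "Bad", "output": "short"},  # expected to fail validation
    ]

    validator = QualityValidator(min_length=10, max_length=5000, min_score=60.0)
    batch_results = validator.validate_batch(sample_data, verbose=True)
    print(validator.get_quality_report(sample_data))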