"""
Quality Validator Module

Validates training data quality across multiple dimensions.
"""

import re
from typing import List, Dict, Any, Tuple


class QualityValidator:
    """Validate and score training data quality."""

    def __init__(
        self,
        min_length: int = 10,
        max_length: int = 5000,
        min_score: float = 60.0
    ):
        """
        Initialize quality validator.

        Args:
            min_length: Minimum output length, in characters
            max_length: Maximum output length, in characters
            min_score: Minimum quality score threshold (0-100)
        """
        self.min_length = min_length
        self.max_length = max_length
        self.min_score = min_score

    def validate_example(self, example: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """
        Validate a single example.

        Args:
            example: Data example to validate

        Returns:
            Tuple of (is_valid, list_of_issues)
        """
        issues = []

        # Check required fields
        if "instruction" not in example:
            issues.append("Missing 'instruction' field")
        if "output" not in example:
            issues.append("Missing 'output' field")

        if issues:
            return False, issues

        # Check lengths
        instruction = example.get("instruction", "")
        output = example.get("output", "")

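        # The instruction floor is a fixed 5 characters; outputs use the
        # configured min/max bounds.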
        if len(instruction) < 5:
            issues.append("Instruction too short")
        if len(output) < self.min_length:
            issues.append(f"Output too short (min {self.min_length} chars)")
        if len(output) > self.max_length:
            issues.append(f"Output too long (max {self.max_length} chars)")

        # Check for empty output
        if not output.strip():
            issues.append("Empty output")

        return len(issues) == 0, issues

    def score_example(self, example: Dict[str, Any]) -> float:
        """
        Score example quality (0-100).

        Scoring dimensions:
        - Length appropriateness
        - Completeness
        - Coherence (basic checks)

        Args:
            example: Data example

        Returns:
            Quality score (0-100)
        """
        score = 100.0

        # Check validity first
        is_valid, issues = self.validate_example(example)
        if not is_valid:
            score -= 20.0 * len(issues)

        if score <= 0:
            return 0.0

        # Length scoring
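        # Note: length issues flagged by validate_example already cost 20
        # points each above, so out-of-range outputs are penalized a second
        # time here.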
        output = example.get("output", "")
        output_len = len(output)

        if output_len < self.min_length:
            score -= 20.0
        elif output_len > self.max_length:
            score -= 10.0

        # Coherence checks
        # Check for repetition
        words = output.lower().split()
        if len(words) > 0:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < 0.3:  # Too repetitive
                score -= 30.0

        # Check for proper sentences (basic)
        sentences = re.split(r'[.!?]+', output)
        valid_sentences = [s for s in sentences if len(s.strip()) > 10]
        if len(valid_sentences) == 0:
            score -= 20.0

        # Check for gibberish (basic)
        if output_len > 20:
            # Check if output has reasonable word length distribution
            avg_word_len = sum(len(w) for w in words) / max(len(words), 1)
            if avg_word_len > 15 or avg_word_len < 2:  # Likely gibberish
                score -= 25.0

        return max(0.0, min(100.0, score))

    def validate_batch(
        self,
        data: List[Dict[str, Any]],
        verbose: bool = False
    ) -> Dict[str, Any]:
        """
        Validate a batch of examples.

        Args:
            data: List of data examples
            verbose: Print detailed validation info

        Returns:
            Results dict with counts, pass rate, average score, and the
            valid/invalid example lists
        """
        valid_data = []
        invalid_data = []
        scores = []

        for i, example in enumerate(data):
            is_valid, issues = self.validate_example(example)
            score = self.score_example(example)
            scores.append(score)

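            # Keep an example only when it both passes structural validation
            # and clears the configured score threshold.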
            if is_valid and score >= self.min_score:
                valid_data.append(example)
            else:
                invalid_data.append({
                    "example": example,
                    "issues": issues,
                    "score": score
                })

                if verbose:
                    print(f"Example {i} failed validation (score: {score:.1f})")
                    for issue in issues:
                        print(f"  - {issue}")

        avg_score = sum(scores) / len(scores) if scores else 0.0

        results = {
            "total": len(data),
            "valid": len(valid_data),
            "invalid": len(invalid_data),
            "pass_rate": len(valid_data) / len(data) if data else 0.0,
            "avg_score": avg_score,
            "valid_data": valid_data,
            "invalid_data": invalid_data
        }

        if verbose:
            print("\n✅ Validation complete:")
            print(f"  Total: {results['total']}")
            print(f"  Valid: {results['valid']}")
            print(f"  Invalid: {results['invalid']}")
            print(f"  Pass rate: {results['pass_rate']*100:.1f}%")
            print(f"  Avg score: {avg_score:.1f}")

        return results

    def filter_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Filter data, keeping only valid examples.

        Args:
            data: List of data examples

        Returns:
            Filtered valid data
        """
        results = self.validate_batch(data)
        return results["valid_data"]

    def get_quality_report(self, data: List[Dict[str, Any]]) -> str:
        """
        Generate a quality report for the data.

        Args:
            data: List of data examples

        Returns:
            Formatted quality report
        """
        results = self.validate_batch(data)

        report = f"""
DATA QUALITY REPORT
===================

Total Examples: {results['total']}
Valid Examples: {results['valid']}
Invalid Examples: {results['invalid']}
Pass Rate: {results['pass_rate']*100:.1f}%
Average Quality Score: {results['avg_score']:.1f}/100

"""

        if results['invalid_data']:
            report += "COMMON ISSUES:\n"
            issue_counts = {}
            for item in results['invalid_data']:
                for issue in item['issues']:
                    issue_counts[issue] = issue_counts.get(issue, 0) + 1

            for issue, count in sorted(issue_counts.items(), key=lambda x: x[1], reverse=True):
                report += f"  - {issue}: {count} examples\n"

        return report
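

if __name__ == "__main__":
    # Illustrative usage sketch; the sample records below are hypothetical
    # and exist only to exercise the validator with default settings.
    validator = QualityValidator(min_length=10, max_length=5000, min_score=60.0)

    sample_data = [
        {
            "instruction": "Explain gradient descent.",
            "output": (
                "Gradient descent iteratively updates model parameters in "
                "the direction that reduces the loss function."
            ),
        },
        {"instruction": "Hi", "output": "ok"},  # both fields too short
        {"output": "This record is missing the instruction field entirely."},
    ]

    # Prints totals, pass rate, average score, and a tally of common issues.
    print(validator.get_quality_report(sample_data))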