File size: 5,590 Bytes
4089b4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Validation utilities for model and data validation
"""
import os
import yaml
from typing import Dict, List, Optional
import logging
from pathlib import Path


def validate_config(config: Dict) -> List[str]:
    """
    Validate a loaded configuration dictionary for common issues.

    Malformed input (a config or section that is not a mapping, non-numeric
    hyperparameters or split sizes, null labels) is reported as a validation
    message instead of raising ``TypeError``/``AttributeError``.

    Args:
        config: Configuration dictionary; must contain the 'model',
            'training', 'data' and 'labels' sections

    Returns:
        List of validation error messages (empty if valid)
    """
    # Local import: only this function needs these ABCs.
    from collections.abc import Collection, Mapping

    # An empty YAML file loads as None; report that instead of crashing on `in`.
    if not isinstance(config, Mapping):
        return [f"Configuration must be a mapping of sections, got {type(config).__name__}"]

    errors: List[str] = []

    # Check required sections
    required_sections = ['model', 'training', 'data', 'labels']
    for section in required_sections:
        if section not in config:
            errors.append(f"Missing required section: {section}")

    if errors:
        return errors

    # The per-section checks below use key lookups, so each section must be a mapping
    for section in ('model', 'training', 'data'):
        if not isinstance(config[section], Mapping):
            errors.append(f"Section '{section}' must be a mapping, got {type(config[section]).__name__}")

    if errors:
        return errors

    labels = config['labels']
    # `labels: null` (or a scalar) has no length; count it as zero labels
    label_count = len(labels) if isinstance(labels, Collection) else 0

    # Validate model section
    model = config['model']
    if 'name' not in model:
        errors.append("model.name is required")
    if 'num_labels' not in model:
        errors.append("model.num_labels is required")
    elif model['num_labels'] != label_count:
        errors.append(f"model.num_labels ({model['num_labels']}) doesn't match number of labels ({label_count})")

    # Validate training section
    training = config['training']
    for key in ('num_train_epochs', 'learning_rate', 'per_device_train_batch_size'):
        if key not in training:
            continue
        try:
            non_positive = training[key] <= 0
        except TypeError:
            # e.g. YAML `learning_rate: 2e-5` is loaded by PyYAML as the string '2e-5'
            errors.append(f"training.{key} must be a number, got {training[key]!r}")
            continue
        if non_positive:
            errors.append(f"training.{key} must be positive")

    # Validate data section
    data = config['data']
    if 'data_path' in data:
        try:
            path_exists = os.path.exists(data['data_path'])
        except TypeError:
            # os.path.exists raises for values that are not path-like (e.g. None)
            path_exists = False
        if not path_exists:
            errors.append(f"Data file not found: {data['data_path']}")

    # Missing split sizes count as 0, so all three must be given and sum to 1.0
    train_size = data.get('train_size', 0)
    val_size = data.get('val_size', 0)
    test_size = data.get('test_size', 0)
    try:
        total = train_size + val_size + test_size
        deviation = abs(total - 1.0)
    except TypeError:
        errors.append("Data split sizes (train_size, val_size, test_size) must be numbers")
    else:
        if deviation > 1e-6:
            errors.append(f"Data split sizes must sum to 1.0, got {total}")

    # Validate labels
    if not labels:
        errors.append("labels section is required and cannot be empty")
    elif not isinstance(labels, Collection):
        errors.append("labels must be a list of label names")
    else:
        try:
            has_duplicates = len(set(labels)) != len(labels)
        except TypeError:
            # Unhashable entries (e.g. nested lists/dicts) cannot be deduplicated
            errors.append("labels must be hashable values such as strings")
        else:
            if has_duplicates:
                errors.append("labels must be unique")

    return errors


def validate_model_path(model_path: str) -> bool:
    """
    Check that a model directory is present and holds its mandatory files.

    Args:
        model_path: Path to model directory

    Returns:
        True if the path exists and every required file is found under it,
        False otherwise (the first problem found is logged as an error)
    """
    if os.path.exists(model_path):
        # Full paths of the files that must sit directly under the model path
        expected_paths = (os.path.join(model_path, name) for name in ['config.json'])
        first_missing = next(
            (candidate for candidate in expected_paths if not os.path.exists(candidate)),
            None,
        )
        if first_missing is None:
            return True

        logging.error(f"Required file missing: {first_missing}")
        return False

    logging.error(f"Model path does not exist: {model_path}")
    return False


def validate_data_file(data_path: str, required_columns: Optional[List[str]] = None) -> List[str]:
    """
    Validate data file format and content.

    The file is read as CSV with pandas. Beyond the required-column check,
    the 'comment' column (if present) is checked for missing or blank
    entries and the 'label' column (if present) for missing values.

    Args:
        data_path: Path to data file
        required_columns: List of required column names
            (defaults to ['comment', 'label'])

    Returns:
        List of validation error messages (empty if valid)
    """
    errors = []

    if required_columns is None:
        required_columns = ['comment', 'label']

    if not os.path.exists(data_path):
        errors.append(f"Data file not found: {data_path}")
        return errors

    try:
        import pandas as pd
        df = pd.read_csv(data_path)

        # Check required columns
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            errors.append(f"Missing required columns: {missing_columns}")

        # Check for empty dataframe
        if len(df) == 0:
            errors.append("Data file is empty")

        # Check for missing values in required columns
        if 'comment' in df.columns:
            comments = df['comment']
            is_missing = comments.isna()
            # Test blankness per element instead of via the `.str` accessor:
            # `.str` raises on non-string columns (e.g. all-numeric or all-NaN),
            # which would be misreported below as a file-reading error.
            is_blank = comments[~is_missing].map(
                lambda value: isinstance(value, str) and not value.strip()
            )
            empty_comments = int(is_missing.sum()) + int(is_blank.sum())
            if empty_comments > 0:
                errors.append(f"Found {empty_comments} empty comments")

        if 'label' in df.columns:
            missing_labels = df['label'].isna().sum()
            if missing_labels > 0:
                errors.append(f"Found {missing_labels} missing labels")

    except Exception as e:
        # Best-effort boundary: any read/parse failure becomes a validation message
        errors.append(f"Error reading data file: {str(e)}")

    return errors


def validate_config_file(config_path: str) -> bool:
    """
    Load a YAML configuration file and validate its contents.

    Every problem found is logged via ``logging.error``; nothing is raised.

    Args:
        config_path: Path to configuration file

    Returns:
        True if the file exists, parses to a mapping and passes
        ``validate_config``; False otherwise
    """
    if not os.path.exists(config_path):
        logging.error(f"Config file not found: {config_path}")
        return False

    try:
        # Read as UTF-8 explicitly rather than depending on the platform locale
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # An empty document loads as None and a bare scalar/list is not a config;
        # report that clearly instead of as an opaque TypeError from validation.
        if not isinstance(config, dict):
            logging.error(
                f"Config file must contain a mapping of sections, got {type(config).__name__}: {config_path}"
            )
            return False

        errors = validate_config(config)
        if errors:
            logging.error("Configuration validation errors:")
            for error in errors:
                logging.error(f"  - {error}")
            return False

        logging.info("Configuration file is valid")
        return True

    except Exception as e:
        # Best-effort boundary: read/parse failures are logged, not propagated
        logging.error(f"Error reading config file: {str(e)}")
        return False