import torch
import os
from model import GPT, GPTConfig

def compress_model(compression_type='cpu_compatible'):
    """
    Load the original checkpoint and re-save it with the chosen compression.

    Args:
        compression_type (str):
            'cpu_compatible' - Keeps FP32 weights but strips everything except
                the model weights and config
            'fp16' - Half precision; smaller and fine on GPU, but not all CPU
                ops support it
            'quantized' - INT8 dynamic quantization of the Linear layers;
                smallest file, slightly lower output quality
    """
    input_path = 'best_model.pt'
    output_path = f'compressed_model_{compression_type}.pt'
    
    print(f"Loading model from {input_path}...")
    
    # Load original model
    checkpoint = torch.load(input_path, map_location='cpu')
    
    # Get original size
    original_size = os.path.getsize(input_path) / (1024 * 1024)  # MB
    print(f"Original model size: {original_size:.2f} MB")
    
    # Initialize model
    config = GPTConfig(
        block_size=1024,
        vocab_size=50304,
        n_layer=12,
        n_head=12,
        n_embd=768
    )
    
    model = GPT(config)
    model.load_state_dict(checkpoint['model_state_dict'])
    
    # Apply compression based on type
    if compression_type == 'fp16':
        model = model.half()  # Convert to FP16
        dtype = 'float16'
    elif compression_type == 'quantized':
        # Quantize the model to INT8
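        # Dynamic quantization stores the Linear layer weights as INT8 and
        # quantizes activations on the fly at inference time; this path is
        # CPU-oriented (fbgemm/qnnpack backends).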
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
        dtype = 'int8'
    else:  # cpu_compatible
        model = model.float()  # Ensure FP32
        dtype = 'float32'
    
    # Create minimal checkpoint
    compressed_checkpoint = {
        'model_state_dict': model.state_dict(),
        'config': {
            'block_size': config.block_size,
            'vocab_size': config.vocab_size,
            'n_layer': config.n_layer,
            'n_head': config.n_head,
            'n_embd': config.n_embd
        },
        'dtype': dtype,
        'compression_type': compression_type
    }
    
    # Save compressed model
    print(f"Saving {compression_type} compressed model...")
    torch.save(compressed_checkpoint, output_path)
    
    # Get compressed size
    compressed_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    print(f"Compressed model size: {compressed_size:.2f} MB")
    print(f"Compression ratio: {original_size/compressed_size:.2f}x")
    
    # Verify loading
    print("\nVerifying compressed model...")
    try:
        test_load = torch.load(output_path, map_location='cpu')
        print("✓ Compressed model loads successfully!")
        print(f"Model type: {test_load['compression_type']}")
        print(f"Data type: {test_load['dtype']}")
    except Exception as e:
        print(f"Error loading compressed model: {str(e)}")

if __name__ == "__main__":
    # Create CPU-compatible version
    compress_model('cpu_compatible')
    
    # Optionally create FP16 version for GPU
    # compress_model('fp16')
    
    # Optionally create quantized version
    # compress_model('quantized')