import torch
import os
from model import GPT, GPTConfig


def compress_model(compression_type='cpu_compatible'):
    """
    Load the original model and compress it using the specified method.

    Args:
        compression_type (str):
            'cpu_compatible' - keeps FP32 weights but strips training artifacts
            'fp16' - half precision; better for GPU, may not work on all CPUs
            'quantized' - dynamic INT8 quantization; most compressed, slightly lower quality
    """
    input_path = 'best_model.pt'
    output_path = f'compressed_model_{compression_type}.pt'

    print(f"Loading model from {input_path}...")

    # Load original model
    checkpoint = torch.load(input_path, map_location='cpu')

    # Get original size
    original_size = os.path.getsize(input_path) / (1024 * 1024)  # MB
    print(f"Original model size: {original_size:.2f} MB")

    # Initialize model
    config = GPTConfig(
        block_size=1024,
        vocab_size=50304,
        n_layer=12,
        n_head=12,
        n_embd=768
    )
    model = GPT(config)
    model.load_state_dict(checkpoint['model_state_dict'])
    # Apply compression based on type
    if compression_type == 'fp16':
        model = model.half()  # Convert to FP16
        dtype = 'float16'
    elif compression_type == 'quantized':
        # Quantize the model to INT8
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
        dtype = 'int8'
    else:  # cpu_compatible
        model = model.float()  # Ensure FP32
        dtype = 'float32'
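    # Note: quantize_dynamic swaps nn.Linear modules for dynamically quantized
    # versions whose state_dict stores packed INT8 weights, so that state_dict
    # can only be loaded back into a model that has been quantized the same way
    # (see the loader sketch after compress_model below).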
    # Create minimal checkpoint
    compressed_checkpoint = {
        'model_state_dict': model.state_dict(),
        'config': {
            'block_size': config.block_size,
            'vocab_size': config.vocab_size,
            'n_layer': config.n_layer,
            'n_head': config.n_head,
            'n_embd': config.n_embd
        },
        'dtype': dtype,
        'compression_type': compression_type
    }
    # Save compressed model
    print(f"Saving {compression_type} compressed model...")
    torch.save(compressed_checkpoint, output_path)

    # Get compressed size
    compressed_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    print(f"Compressed model size: {compressed_size:.2f} MB")
    print(f"Compression ratio: {original_size/compressed_size:.2f}x")

    # Verify loading
    print("\nVerifying compressed model...")
    try:
        test_load = torch.load(output_path, map_location='cpu')
        print("✓ Compressed model loads successfully!")
        print(f"Model type: {test_load['compression_type']}")
        print(f"Data type: {test_load['dtype']}")
    except Exception as e:
        print(f"Error loading compressed model: {str(e)}")
if __name__ == "__main__":
    # Create CPU-compatible version
    compress_model('cpu_compatible')

    # Optionally create FP16 version for GPU
    # compress_model('fp16')

    # Optionally create quantized version
    # compress_model('quantized')