import torch
import os

from model import GPT, GPTConfig


def compress_model(compression_type='cpu_compatible'):
    """
    Load the original model and compress it using the specified method.

    Args:
        compression_type (str):
            'cpu_compatible' - uses FP32 but removes training artifacts
            'fp16'           - half precision; better for GPU but may not work on all CPUs
            'quantized'      - INT8 quantization; most compressed but slightly lower quality
    """
    input_path = 'best_model.pt'
    output_path = f'compressed_model_{compression_type}.pt'

    print(f"Loading model from {input_path}...")

    # Load the original training checkpoint onto the CPU
    checkpoint = torch.load(input_path, map_location='cpu')

    # Report the original file size
    original_size = os.path.getsize(input_path) / (1024 * 1024)  # MB
    print(f"Original model size: {original_size:.2f} MB")

    # Rebuild the model architecture and load the trained weights
    config = GPTConfig(
        block_size=1024,
        vocab_size=50304,
        n_layer=12,
        n_head=12,
        n_embd=768
    )
    model = GPT(config)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Apply compression based on type
    if compression_type == 'fp16':
        model = model.half()  # convert weights to FP16
        dtype = 'float16'
    elif compression_type == 'quantized':
        # Dynamically quantize the linear layers to INT8
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
        dtype = 'int8'
    else:  # cpu_compatible
        model = model.float()  # ensure FP32
        dtype = 'float32'

    # Create a minimal checkpoint: weights plus the config needed to rebuild the model
    compressed_checkpoint = {
        'model_state_dict': model.state_dict(),
        'config': {
            'block_size': config.block_size,
            'vocab_size': config.vocab_size,
            'n_layer': config.n_layer,
            'n_head': config.n_head,
            'n_embd': config.n_embd
        },
        'dtype': dtype,
        'compression_type': compression_type
    }

    # Save the compressed model
    print(f"Saving {compression_type} compressed model...")
    torch.save(compressed_checkpoint, output_path)

    # Report the compressed file size and compression ratio
    compressed_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    print(f"Compressed model size: {compressed_size:.2f} MB")
    print(f"Compression ratio: {original_size / compressed_size:.2f}x")

    # Verify that the saved file loads cleanly
    print("\nVerifying compressed model...")
    try:
        test_load = torch.load(output_path, map_location='cpu')
        print("✓ Compressed model loads successfully!")
        print(f"Model type: {test_load['compression_type']}")
        print(f"Data type: {test_load['dtype']}")
    except Exception as e:
        print(f"Error loading compressed model: {str(e)}")
if __name__ == "__main__":
    # Create CPU-compatible version
    compress_model('cpu_compatible')

    # Optionally create FP16 version for GPU
    # compress_model('fp16')

    # Optionally create quantized version
    # compress_model('quantized')
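
    # Example (sketch, uses the illustrative loader defined above):
    # model = load_compressed_model('compressed_model_cpu_compatible.pt')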