import torch
import os

from model import GPT, GPTConfig


def compress_model(compression_type='cpu_compatible'):
    """
    Load the original model and compress it using the specified method.

    Args:
        compression_type (str):
            'cpu_compatible' - uses FP32 but removes training artifacts
            'fp16'           - half precision; better for GPU but may not work on all CPUs
            'quantized'      - INT8 quantization; most compressed but slightly lower quality
    """
    input_path = 'best_model.pt'
    output_path = f'compressed_model_{compression_type}.pt'

    print(f"Loading model from {input_path}...")

    # Load the original training checkpoint onto the CPU
    checkpoint = torch.load(input_path, map_location='cpu')

    # Report the original file size
    original_size = os.path.getsize(input_path) / (1024 * 1024)  # MB
    print(f"Original model size: {original_size:.2f} MB")

    # Rebuild the model architecture and load the trained weights
    config = GPTConfig(
        block_size=1024,
        vocab_size=50304,
        n_layer=12,
        n_head=12,
        n_embd=768
    )
    model = GPT(config)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Apply compression based on type
    if compression_type == 'fp16':
        model = model.half()  # convert weights to FP16
        dtype = 'float16'
    elif compression_type == 'quantized':
        # Dynamically quantize the linear layers to INT8
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
        dtype = 'int8'
    else:  # cpu_compatible
        model = model.float()  # ensure FP32
        dtype = 'float32'

    # Create a minimal checkpoint: weights plus the config needed to rebuild the model
    compressed_checkpoint = {
        'model_state_dict': model.state_dict(),
        'config': {
            'block_size': config.block_size,
            'vocab_size': config.vocab_size,
            'n_layer': config.n_layer,
            'n_head': config.n_head,
            'n_embd': config.n_embd
        },
        'dtype': dtype,
        'compression_type': compression_type
    }

    # Save the compressed model
    print(f"Saving {compression_type} compressed model...")
    torch.save(compressed_checkpoint, output_path)

    # Report the compressed file size and compression ratio
    compressed_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    print(f"Compressed model size: {compressed_size:.2f} MB")
    print(f"Compression ratio: {original_size / compressed_size:.2f}x")

    # Verify that the saved file loads cleanly
    print("\nVerifying compressed model...")
    try:
        test_load = torch.load(output_path, map_location='cpu')
        print("✓ Compressed model loads successfully!")
        print(f"Model type: {test_load['compression_type']}")
        print(f"Data type: {test_load['dtype']}")
    except Exception as e:
        print(f"Error loading compressed model: {str(e)}")
if __name__ == "__main__":
    # Create CPU-compatible version
    compress_model('cpu_compatible')

    # Optionally create FP16 version for GPU
    # compress_model('fp16')

    # Optionally create quantized version
    # compress_model('quantized')
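
    # Example (sketch, uses the illustrative loader defined above):
    # model = load_compressed_model('compressed_model_cpu_compatible.pt')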