"""
Convert DeepSeek-R1-Distill-Qwen-1.5B to ternary format.

Stores linear weights as bitplanes (pos_mask, neg_mask) plus a per-row scale.
Embeddings, layernorms, and the LM head stay FP16.

(c) 2026 OpenTransformers Ltd / Scott Bisset
"""

import json
import os
import time
from pathlib import Path

import numpy as np


def load_safetensors(model_dir):
    """Load all tensors from the model's safetensors files as float32 numpy arrays."""
    from safetensors.torch import load_file

    tensors = {}
    for f in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {f.name}...")
        state = load_file(str(f))
        for key, val in state.items():
            tensors[key] = val.float().numpy()
    return tensors


def quantize_row_ternary(row, alpha=0.7):
    """Quantize a single row to ternary {-1, 0, +1} with vectorized bitpacking.

    Single-row reference for quantize_weight_matrix below; the converter
    itself never calls it.
    """
    row = row.astype(np.float32)
    mean_abs = np.mean(np.abs(row))
    threshold = alpha * mean_abs

    # Weights at least `threshold` in magnitude survive; the rest become zero.
    pos = row >= threshold
    neg = row <= -threshold

    # Per-row scale: mean magnitude of the surviving weights.
    nz_mask = pos | neg
    scale = np.mean(np.abs(row[nz_mask])) if nz_mask.any() else np.float32(1.0)

    # Pad the masks so the row length is a multiple of 64 bits.
    in_dim = len(row)
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros(pad, dtype=bool)])
        neg = np.concatenate([neg, np.zeros(pad, dtype=bool)])

    # Pack each run of 64 booleans into one uint64 (bit j = element j).
    pos_r = pos.reshape(-1, 64).astype(np.uint64)
    neg_r = neg.reshape(-1, 64).astype(np.uint64)
    bit_positions = np.uint64(1) << np.arange(64, dtype=np.uint64)
    pos_bits = np.bitwise_or.reduce(pos_r * bit_positions, axis=1)
    neg_bits = np.bitwise_or.reduce(neg_r * bit_positions, axis=1)

    return pos_bits, neg_bits, np.float32(scale)
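

# A minimal round-trip sketch (not part of the conversion path): how one
# (pos_bits, neg_bits, scale) triple maps back to an approximate row. The
# helper name `dequantize_row` is ours, added for illustration.
def dequantize_row(pos_bits, neg_bits, scale, in_dim):
    """Unpack one row's bitplanes into the approximate float32 row they encode."""
    bit_positions = np.uint64(1) << np.arange(64, dtype=np.uint64)
    pos = ((pos_bits[:, None] & bit_positions) != 0).reshape(-1)[:in_dim]
    neg = ((neg_bits[:, None] & bit_positions) != 0).reshape(-1)[:in_dim]
    return scale * (pos.astype(np.float32) - neg.astype(np.float32))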


def quantize_weight_matrix(weight, alpha=0.7):
    """Quantize a full weight matrix [out_dim, in_dim] to ternary, fully vectorized."""
    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape

    # Per-row magnitude threshold: weights below alpha * mean|row| become zero.
    row_means = np.mean(np.abs(w), axis=1, keepdims=True)
    thresholds = alpha * row_means

    pos = w >= thresholds
    neg = w <= -thresholds
    nz = pos | neg

    # Per-row scale: mean |w| over the surviving entries, with a fallback of
    # 1.0 for rows where nothing survives the threshold.
    nz_counts = nz.sum(axis=1)
    nz_sums = np.where(nz, np.abs(w), 0.0).sum(axis=1)
    scales = np.where(nz_counts > 0, nz_sums / np.maximum(nz_counts, 1), 1.0).astype(np.float32)

    total = out_dim * in_dim
    sparsity = 1.0 - np.sum(nz) / total

    # Pad columns so every row is a whole number of 64-bit words.
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros((out_dim, pad), dtype=bool)], axis=1)
        neg = np.concatenate([neg, np.zeros((out_dim, pad), dtype=bool)], axis=1)

    padded_dim = pos.shape[1]
    chunks = padded_dim // 64

    # Pack each run of 64 booleans into one uint64 (bit j = column j).
    bit_positions = np.uint64(1) << np.arange(64, dtype=np.uint64)
    pos_r = pos.reshape(out_dim, chunks, 64).astype(np.uint64)
    neg_r = neg.reshape(out_dim, chunks, 64).astype(np.uint64)
    all_pos = np.bitwise_or.reduce(pos_r * bit_positions, axis=2)
    all_neg = np.bitwise_or.reduce(neg_r * bit_positions, axis=2)

    return all_pos, all_neg, scales, sparsity
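

# A sketch of how inference code might consume the packed bitplanes: each
# output element is scales[i] * (sum of x over positive bits minus sum of x
# over negative bits). `ternary_matvec` is illustrative only; a real kernel
# would operate on the uint64 words directly rather than unpacking to bool.
def ternary_matvec(all_pos, all_neg, scales, x):
    """Reference y = W @ x for a bitplane-packed ternary matrix (slow)."""
    out_dim, chunks = all_pos.shape
    bit_positions = np.uint64(1) << np.arange(64, dtype=np.uint64)
    # Zero-pad the input to the packed (multiple-of-64) width.
    xp = np.zeros(chunks * 64, dtype=np.float32)
    xp[:len(x)] = x
    pos = ((all_pos[:, :, None] & bit_positions) != 0).reshape(out_dim, -1)
    neg = ((all_neg[:, :, None] & bit_positions) != 0).reshape(out_dim, -1)
    return scales * (pos.astype(np.float32) @ xp - neg.astype(np.float32) @ xp)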


def save_ternary_model(tensors, output_dir, alpha=0.7):
    """Convert the full model and save it in ternary format."""
    os.makedirs(output_dir, exist_ok=True)

    # Architecture constants for DeepSeek-R1-Distill-Qwen-1.5B.
    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "alpha": alpha,
    }

    # Attention and MLP projection weights go ternary; everything else
    # (embeddings, norms, LM head) stays FP16.
    ternary_keys = []
    keep_keys = []
    for key in tensors:
        if any(p in key for p in ['q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                                  'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                                  'down_proj.weight']):
            ternary_keys.append(key)
        else:
            keep_keys.append(key)

    print(f"\nTernary layers: {len(ternary_keys)}")
    print(f"FP16 layers: {len(keep_keys)}")

    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    # Quantize and write the ternary layers.
    total_ternary_bytes = 0
    total_original_bytes = 0
    for key in ternary_keys:
        w = tensors[key].astype(np.float32)
        total_original_bytes += w.nbytes

        t0 = time.time()
        pos, neg, scales, sparsity = quantize_weight_matrix(w, alpha)
        dt = time.time() - t0

        prefix = os.path.join(output_dir, key.replace(".", "_"))
        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        ternary_bytes = pos.nbytes + neg.nbytes + scales.nbytes
        total_ternary_bytes += ternary_bytes
        ratio = w.nbytes / ternary_bytes

        print(f"  {key}: {w.shape} -> ternary ({ternary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compression, {sparsity:.1%} sparse, {dt:.1f}s)")

    # Write the remaining tensors as FP16.
    total_fp16_bytes = 0
    for key in keep_keys:
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16_bytes += w.nbytes
        print(f"  {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # The manifest records each tensor's shape so a loader can reconstruct it.
    manifest = {
        "ternary": {k: list(tensors[k].shape) for k in ternary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total_bytes = total_ternary_bytes + total_fp16_bytes
    orig_bytes = total_original_bytes + total_fp16_bytes
    print("\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Ternary linear weights: {total_ternary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    print(f"Compression vs FP32: {orig_bytes/total_bytes:.1f}x")
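

# A sketch of the matching load path, inferred from the naming scheme used by
# save_ternary_model (dots in the key become underscores, with .pos/.neg/
# .scales suffixes) and the shapes recorded in manifest.json. The helper name
# `load_ternary_tensor` is ours, added for illustration.
def load_ternary_tensor(output_dir, key, out_dim, in_dim):
    """Read one quantized matrix back as (pos_bits, neg_bits, scales)."""
    prefix = os.path.join(output_dir, key.replace(".", "_"))
    chunks = (in_dim + 63) // 64  # rows were padded to whole uint64 words
    pos = np.fromfile(prefix + ".pos", dtype=np.uint64).reshape(out_dim, chunks)
    neg = np.fromfile(prefix + ".neg", dtype=np.uint64).reshape(out_dim, chunks)
    scales = np.fromfile(prefix + ".scales", dtype=np.float32)
    return pos, neg, scales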


if __name__ == "__main__":
    import sys

    model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "deepseek-r1-1.5b-ternary"
    alpha = float(sys.argv[3]) if len(sys.argv) > 3 else 0.7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to ternary (alpha={alpha})...")
    save_ternary_model(tensors, output_dir, alpha)
    print("Done!")
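
# Example invocation (the defaults above; the filename is whatever this
# script is saved as, e.g. convert_to_ternary.py):
#   python convert_to_ternary.py deepseek-r1-1.5b-hf deepseek-r1-1.5b-ternary 0.7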