|
|
|
|
|
"""Packed unary converter: uint8 magnitudes + bitpacked signs + per-row scales.""" |
|
|
import os, json, sys, time |
|
|
import numpy as np |
|
|
from pathlib import Path |
|
|
|
|
|
def load_safetensors(model_dir):
    """Read every ``*.safetensors`` shard under *model_dir* into one dict.

    Shards are processed in sorted filename order; each tensor is converted
    to a float32 numpy array. Returns a mapping of tensor name -> array.
    """
    # Deferred import so the module can be imported without torch/safetensors.
    from safetensors.torch import load_file

    result = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f" Loading {shard.name}...")
        shard_contents = load_file(str(shard))
        for name, tensor in shard_contents.items():
            result[name] = tensor.float().numpy()
    return result
|
|
|
|
|
def quantize_packed(w, n_levels=7):
    """Quantize a 2-D weight matrix into uint8 magnitudes + bitpacked signs.

    Each row is scaled so its largest absolute value maps to *n_levels*;
    magnitudes are rounded and clipped to [0, n_levels]. Signs are packed
    64 columns per uint64 word (column j of a chunk occupies bit j; columns
    beyond the true width are zero-padded).

    Returns a 6-tuple:
        (mags, sign_bits, scales, row_max_mag, mean_magnitude, zero_fraction)
    """
    rows, cols = w.shape
    words_per_row = (cols + 63) // 64
    padded_cols = words_per_row * 64

    # Per-row scale; all-zero rows get a dummy peak of 1.0 so the divide
    # below is safe (their magnitudes come out 0 regardless).
    peak = np.abs(w).max(axis=1, keepdims=True)
    peak = np.where(peak == 0, 1.0, peak)
    scales = (peak.ravel() / n_levels).astype(np.float32)

    mags = np.clip(np.round(np.abs(w / scales[:, None])), 0, n_levels).astype(np.uint8)
    row_max_mag = mags.max(axis=1).astype(np.uint8)

    # Zero-pad the per-column sign mask out to a multiple of 64, then fold
    # each group of 64 booleans into one uint64 word.
    neg_mask = w < 0
    if cols < padded_cols:
        padded_mask = np.zeros((rows, padded_cols), dtype=bool)
        padded_mask[:, :cols] = neg_mask
    else:
        padded_mask = neg_mask
    bit_values = np.uint64(1) << np.arange(64, dtype=np.uint64)
    chunked = padded_mask.reshape(rows, words_per_row, 64).astype(np.uint64)
    sign_bits = np.bitwise_or.reduce(chunked * bit_values, axis=2)

    return mags, sign_bits, scales, row_max_mag, np.mean(mags), np.mean(mags == 0)
|
|
|
|
|
def convert(tensors, output_dir, n_levels=7):
    """Write *tensors* to *output_dir* in the packed-unary on-disk format.

    Linear projection weights (q/k/v/o/gate/up/down) are quantized with
    quantize_packed and stored as four raw files each (.mags, .signs,
    .scales, .rmm); every other tensor is dumped raw as float16 (.fp16).
    A config.json (geometry hard-coded for the 1.5B Qwen2-style model)
    and a manifest.json mapping tensor names to shapes are written too.

    Parameters:
        tensors: dict of name -> float32 numpy array (see load_safetensors)
        output_dir: destination directory, created if missing
        n_levels: number of magnitude levels per weight (default 7)

    Fix vs. previous version: when no linear keys are present, the summary
    no longer computes np.mean([]) (RuntimeWarning + nan) or divides by it.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Model geometry is hard-coded for this specific 1.5B checkpoint.
    config = {"hidden_size":1536,"intermediate_size":8960,"num_attention_heads":12,
              "num_key_value_heads":2,"num_hidden_layers":28,"vocab_size":151936,
              "head_dim":128,"rope_theta":1000000.0,"rms_norm_eps":1e-6,
              "n_levels":n_levels,"quant_type":"packed_unary"}

    # Only the seven per-layer linear projections are quantized.
    projection_patterns = ('q_proj.weight','k_proj.weight','v_proj.weight','o_proj.weight',
                           'gate_proj.weight','up_proj.weight','down_proj.weight')
    linear_keys = [k for k in tensors if any(p in k for p in projection_patterns)]
    other_keys = [k for k in tensors if k not in linear_keys]

    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    total_packed = total_orig = 0  # total_orig tracked for reference only
    all_avg = []
    for key in linear_keys:
        w = tensors[key]
        total_orig += w.nbytes
        t0 = time.time()
        mags, sb, sc, rmm, am, sp = quantize_packed(w, n_levels)
        dt = time.time() - t0
        # File names flatten the dotted tensor name (dots -> underscores).
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        mags.tofile(pfx+".mags"); sb.tofile(pfx+".signs")
        sc.tofile(pfx+".scales"); rmm.tofile(pfx+".rmm")
        ub = mags.nbytes + sb.nbytes + sc.nbytes + rmm.nbytes
        total_packed += ub
        all_avg.append(am)
        print(f" {key}: {w.shape} -> {ub/1024:.0f}KB (avg_mag={am:.2f}, {dt:.1f}s)")

    # Everything else (embeddings, norms, ...) stays at fp16 precision.
    total_fp16 = 0
    for key in other_keys:
        w = tensors[key].astype(np.float16)
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(pfx+".fp16"); total_fp16 += w.nbytes

    # Manifest records shapes so the loader can reconstruct each array.
    manifest = {"packed":{k:list(tensors[k].shape) for k in linear_keys},
                "fp16":{k:list(tensors[k].shape) for k in other_keys}}
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    print(f"\n=== PACKED UNARY ===")
    print(f"Packed linear: {total_packed/1e6:.1f} MB | FP16 other: {total_fp16/1e6:.1f} MB")
    avg_mag = float(np.mean(all_avg)) if all_avg else 0.0
    print(f"Total: {(total_packed+total_fp16)/1e6:.1f} MB | Avg mag: {avg_mag:.3f}")
    if avg_mag > 0:  # skip when nothing was quantized (avoids nan / div-by-zero)
        print(f"Expected speedup vs 7-plane: {7/avg_mag:.1f}x")
|
|
|
|
|
if __name__ == "__main__":
    # CLI: python <script> [model_dir] [output_dir]; both args optional.
    args = sys.argv[1:]
    model_dir = args[0] if len(args) > 0 else "deepseek-r1-1.5b-hf"
    output_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-packed"
    tensors = load_safetensors(model_dir)
    convert(tensors, output_dir)
    print("Done!")
|
|
|