jerryzh168 committed · verified
Commit 0955f10 · Parent(s): 44b9297

Update README.md

Files changed (1): README.md (+22 −12)
README.md CHANGED

@@ -48,9 +48,17 @@ intxwo = IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0))
 qconfig_dict = {
     # highest priority
     "model.decoder.layers.3.self_attn.q_proj": int4wo,
+    "model.decoder.layers.3.self_attn.k_proj": int4wo,
+    "model.decoder.layers.3.self_attn.v_proj": int4wo,
+    # vllm
+    "model.decoder.layers.3.self_attn.qkv_proj": int4wo,
+
     "model.decoder.layers.*.self_attn.q_proj": float8dyn,
     "model.decoder.layers.*.self_attn.k_proj": float8dyn,
-    "model.decoder.layers.*.self_attn.v_proj": None,
+    "model.decoder.layers.*.self_attn.v_proj": float8dyn,
+    # vllm
+    "model.decoder.layers.*.self_attn.qkv_proj": float8dyn,
+
     "_default": intxwo,
 }
 quant_config = ModuleFqnToConfig(qconfig_dict)
@@ -65,19 +73,23 @@ print("quantized model:", quantized_model)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 for i in range(12):
     if i == 3:
-        print("type:", quantized_model.model.decoder.layers[i].self_attn.q_proj.weight)
         assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Int4TilePackedTo4dTensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Int4TilePackedTo4dTensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Int4TilePackedTo4dTensor)
     else:
         assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Float8Tensor)
-        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Float8Tensor)
-        assert not isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Float8Tensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Float8Tensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Float8Tensor)
     assert isinstance(quantized_model.model.decoder.layers[i].self_attn.out_proj.weight, IntxUnpackedToInt8Tensor)
 
-# Push to hub
+# # # Push to hub
 MODEL_NAME = model_id.split("/")[-1]
 save_to = f"torchao-testing/{MODEL_NAME}-ModuleFqnToConfig-v1-regex-0.14.0.dev"
 quantized_model.push_to_hub(save_to, safe_serialization=False)
 tokenizer.push_to_hub(save_to)
+# quantized_model.save_pretrained(save_to, safe_serialization=False)
+# tokenizer.save_pretrained(save_to)
+
 
 # Manual Testing
 prompt = "What are we having for dinner?"
@@ -143,10 +155,12 @@ print("quantized model:", quantized_model)
 for i in range(12):
     if i == 3:
         assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Int4TilePackedTo4dTensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Int4TilePackedTo4dTensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Int4TilePackedTo4dTensor)
     else:
         assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Float8Tensor)
-        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Float8Tensor)
-        assert not isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Float8Tensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Float8Tensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Float8Tensor)
     assert isinstance(quantized_model.model.decoder.layers[i].self_attn.out_proj.weight, IntxUnpackedToInt8Tensor)
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -154,10 +168,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_ids = tokenizer(input_text, return_tensors="pt").to(device)
 
 output = quantized_model.generate(**input_ids, max_new_tokens=max_new_tokens)
-EXPECTED_OUTPUT = [
-    "What are we having for dinner?\n\nJessica: (smiling)",
-    "What are we having for dinner?\n\nJess: (smiling) I",
-]
-# self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
 
 ```
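
Context for the first hunk: keys in `qconfig_dict` are module fully-qualified names or wildcard patterns, and the `# highest priority` comment marks that an exact-FQN entry beats the `*` patterns, with `_default` as the fallback for everything else. Below is a minimal sketch of that resolution order, assuming glob-style matching; `resolve_config` and the use of `fnmatch` are illustrative stand-ins for this note, not torchao's actual implementation.

```python
import fnmatch

def resolve_config(fqn: str, qconfig_dict: dict):
    """Illustrative only: pick a quantization config for a module FQN."""
    # 1) An exact fully-qualified-name match has highest priority.
    if fqn in qconfig_dict:
        return qconfig_dict[fqn]
    # 2) Otherwise try wildcard patterns (glob-style here, as an assumption).
    for pattern, config in qconfig_dict.items():
        if pattern != "_default" and fnmatch.fnmatch(fqn, pattern):
            return config
    # 3) Fall back to the catch-all default, if present.
    return qconfig_dict.get("_default")

qconfig_dict = {
    "model.decoder.layers.3.self_attn.q_proj": "int4wo",   # exact match wins
    "model.decoder.layers.*.self_attn.q_proj": "float8dyn",
    "_default": "intxwo",
}
assert resolve_config("model.decoder.layers.3.self_attn.q_proj", qconfig_dict) == "int4wo"
assert resolve_config("model.decoder.layers.7.self_attn.q_proj", qconfig_dict) == "float8dyn"
assert resolve_config("model.decoder.layers.7.fc1", qconfig_dict) == "intxwo"
```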
 
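
For completeness, a minimal sketch of the loading side exercised by the second test block above, assuming the Hugging Face repo produced by the `save_to` id; the repo id and `max_new_tokens` value below are placeholders, and `torchao` must be installed for the quantized weights to deserialize.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id: substitute the actual `save_to` value from the script above.
model_name = "torchao-testing/MODEL_NAME-ModuleFqnToConfig-v1-regex-0.14.0.dev"
device = "cuda"

# Load the quantized checkpoint and its tokenizer back from the Hub.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Same manual test prompt as in the README.
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

output = quantized_model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```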