jerryzh168 committed · Commit 09640ea · verified · 1 Parent(s): 1ecc373

Update README.md

Files changed (1): README.md +51 -0
README.md CHANGED
@@ -6,6 +6,7 @@ config version: 1
torchao version: 0.14.dev
```

+ # Generate Quantized Model
```
import logging

@@ -106,4 +107,54 @@ output_text = tokenizer.batch_decode(
print("Response:", output_text[0][len(prompt) :])

assert(correct_output_text == output_text)
+ ```
+
+
+ # Test Loading
+ ```
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoProcessor,
+     AutoTokenizer,
+     TorchAoConfig,
+ )
+ from torchao.quantization import (
+     Float8Tensor,
+     Int4TilePackedTo4dTensor,
+     IntxUnpackedToInt8Tensor,
+ )
+ import torch
+
+ model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
+ device = "cuda"
+ input_text = "What are we having for dinner?"
+ max_new_tokens = 10
+
+ quantized_model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     device_map=device,
+     dtype=torch.bfloat16,
+ )
+ print("quantized model:", quantized_model)
+ # layer 3 q_proj is int4; other q_proj and all k_proj are float8; v_proj is unquantized; out_proj is int8
+ for i in range(12):
+     if i == 3:
+         assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Int4TilePackedTo4dTensor)
+     else:
+         assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Float8Tensor)
+     assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Float8Tensor)
+     assert not isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Float8Tensor)
+     assert isinstance(quantized_model.model.decoder.layers[i].self_attn.out_proj.weight, IntxUnpackedToInt8Tensor)
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ input_ids = tokenizer(input_text, return_tensors="pt").to(device)
+
+ output = quantized_model.generate(**input_ids, max_new_tokens=max_new_tokens)
+ EXPECTED_OUTPUT = [
+     "What are we having for dinner?\n\nJessica: (smiling)",
+     "What are we having for dinner?\n\nJess: (smiling) I",
+ ]
+ assert tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT
+
  ```
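
The diff only shows the head of the "Generate Quantized Model" script, so for orientation here is a minimal sketch of how a mixed per-module config like this one could be built with torchao's `ModuleFqnToConfig`. The specific config classes, regex patterns, and parameter values below are assumptions inferred from the isinstance checks in the Test Loading script, not the README's actual code:

```
# Hypothetical sketch (not the README's actual script): quantize opt-125m with a
# per-module config, assuming ModuleFqnToConfig accepts "re:"-prefixed regex keys
# over module FQNs and that exact-FQN keys take precedence over regex matches.
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    Int4WeightOnlyConfig,
    IntxWeightOnlyConfig,
    ModuleFqnToConfig,
    PerRow,
)

quant_config = ModuleFqnToConfig({
    # layer 3 q_proj -> int4; the "tile_packed_to_4d" packing value is an
    # assumption chosen to match the Int4TilePackedTo4dTensor check above
    "model.decoder.layers.3.self_attn.q_proj": Int4WeightOnlyConfig(
        group_size=128, int4_packing_format="tile_packed_to_4d"
    ),
    # remaining q_proj and all k_proj -> float8 (loads as Float8Tensor)
    r"re:model\.decoder\.layers\..+\.self_attn\.[qk]_proj": Float8DynamicActivationFloat8WeightConfig(
        granularity=PerRow()
    ),
    # out_proj -> int8 weight-only (loads as IntxUnpackedToInt8Tensor)
    r"re:model\.decoder\.layers\..+\.self_attn\.out_proj": IntxWeightOnlyConfig(
        weight_dtype=torch.int8
    ),
    # v_proj has no entry, so it stays in bfloat16
})

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    device_map="cuda",
    dtype=torch.bfloat16,
    quantization_config=TorchAoConfig(quant_type=quant_config),
)
```

If the checkpoint was produced along these lines, the per-module asserts in Test Loading follow directly from the mapping: each config entry determines the tensor subclass the corresponding weight deserializes to.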