Update README.md
README.md (changed):
@@ -151,7 +151,6 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
     device_map=device,
     dtype=torch.bfloat16,
 )
-print("quantized model:", quantized_model)
 for i in range(12):
     if i == 3:
         assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Int4TilePackedTo4dTensor)
@@ -170,4 +169,12 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(device)
 output = quantized_model.generate(**input_ids, max_new_tokens=max_new_tokens)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
 
+```
+
+Output:
+
+```
+What are we having for dinner?
+A nice dinner with a friend.
+I
 ```
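For context, here is a minimal, self-contained sketch of the README example these hunks touch. The checkpoint name and the `max_new_tokens` value are assumptions (the hunks only show fragments of the snippet), and the sketch presumes the checkpoint was already quantized and saved with torchao, so that `from_pretrained` restores the int4 tensor subclasses; treat it as an illustration of the flow, not the README's exact code.

```python
# Hedged sketch of the README flow modified above; assumptions are marked.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
# Hypothetical id; the hunks never name the checkpoint. Assumed to be an
# OPT-style model (note model.decoder.layers in the assert) that was
# quantized with torchao and saved, so int4 subclasses are restored on load.
model_id = "<torchao-int4-opt-checkpoint>"

tokenizer = AutoTokenizer.from_pretrained(model_id)
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    dtype=torch.bfloat16,
)

input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

max_new_tokens = 16  # assumption; the hunks use the variable but never show its value
output = quantized_model.generate(**input_ids, max_new_tokens=max_new_tokens)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

The assert in the first hunk is the check that quantization actually took effect: decoder layer 3's `q_proj.weight` should have been swapped for torchao's `Int4TilePackedTo4dTensor` subclass rather than remaining a plain `torch.Tensor`.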