Update inference examples to use the correct chat template

#7
by mario-sanz - opened
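The previous examples tokenized a plain string, so the Instruct model never received the chat formatting it was tuned on. Building the prompt as a list of role/content messages and passing it through `tokenizer.apply_chat_template` (or `llm.chat` in vLLM) applies that formatting automatically. As a minimal sketch (not part of this diff), the rendered prompt can be inspected by asking the template for text instead of token ids; the exact markup depends on the chat template shipped with the tokenizer:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3-7B-Instruct")
message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]

# tokenize=False returns the formatted prompt string (role markers, special
# tokens, and the assistant turn opened by add_generation_prompt=True)
# rather than token ids, which is handy for checking what the model will see.
rendered = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
print(rendered)
```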
Files changed (1)
  1. README.md +8 -8
README.md CHANGED
@@ -45,13 +45,13 @@ You can use OLMo with the standard HuggingFace transformers library:
 from transformers import AutoModelForCausalLM, AutoTokenizer
 olmo = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3-7B-Instruct")
 tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3-7B-Instruct")
-message = ["Who would win in a fight - a dinosaur or a cow named Moo Moo?"]
-inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors='pt', return_dict=True)
 # optional verifying cuda
 # inputs = {k: v.to('cuda') for k,v in inputs.items()}
 # olmo = olmo.to('cuda')
 response = olmo.generate(**inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
-print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
+print(tokenizer.decode(response[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
 >> 'This is a fun and imaginative question! Let’s break it down...'
 ```
 
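One detail worth calling out in the updated decode line above: `generate` returns the prompt tokens followed by the generated ones, so decoding `response[0]` in full would echo the chat markup and the question back to the user. Slicing at the prompt length keeps only the model's reply. A small continuation sketch of that step, reusing `inputs`, `response`, and `tokenizer` from the example above:

```python
# Continuation sketch: keep only the newly generated tokens.
prompt_len = inputs.input_ids.shape[1]   # number of tokens in the formatted prompt
new_tokens = response[0][prompt_len:]    # generated continuation only
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```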
@@ -184,8 +184,8 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
-prompt = "Who would win in a fight - a dinosaur or a cow named MooMoo?"
-inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors='pt', return_dict=True).to(model.device)
 
 outputs = model.generate(
     **inputs,
@@ -194,7 +194,7 @@ outputs = model.generate(
     max_new_tokens=32768,
 )
 
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
 ```
 
 ### vllm Example
@@ -210,8 +210,8 @@ sampling_params = SamplingParams(
     max_tokens=32768,
 )
 
-prompt = "Who would win in a fight - a dinosaur or a cow named MooMoo?"
-outputs = llm.generate(prompt, sampling_params)
+message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]
+outputs = llm.chat(message, sampling_params)
 print(outputs[0].outputs[0].text)
 ```
 
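For the vLLM example, `llm.chat` renders the message list with the model's chat template before generation, whereas `llm.generate` treats its input as a raw completion prompt. A self-contained sketch of the corrected flow; the model id and `max_tokens` match the README, while `temperature` and `top_p` here are illustrative placeholders rather than the README's actual values:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="allenai/Olmo-3-7B-Instruct")
# max_tokens matches the hunk above; temperature/top_p are assumed values.
sampling_params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=32768)

message = [{"role": "user", "content": "Who would win in a fight - a dinosaur or a cow named Moo Moo?"}]

# chat() applies the model's chat template to the messages before generating.
outputs = llm.chat(message, sampling_params)
print(outputs[0].outputs[0].text)
```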