Joel Lundgren committed on
Commit f199719 · 1 Parent(s): c1ec8e5

changed back to transformers

Files changed (2)
  1. app.py +27 -23
  2. requirements.txt +0 -2
app.py CHANGED

@@ -1,8 +1,7 @@
 import gradio as gr
 from PIL import Image, ImageDraw
 from ultralytics import YOLO
-from transformers import AutoTokenizer
-from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch

 # Load a pre-trained YOLO model
@@ -60,21 +59,16 @@ def get_llm(model_name):
         return llm_cache[model_name]

     model_map = {
-        "qwen3:0.6b": "onnx-community/Qwen3-0.6B-ONNX",
-        "gemma3:1b": "onnx-community/gemma-3-1b-it-ONNX-GQA"
-    }
-    hf_model_name = model_map[model_name]
-
-    # Tokenizer is loaded from the original model's repo to ensure correct chat templates
-    original_model_map = {
         "qwen3:0.6b": "Qwen/Qwen3-0.6B-Instruct",
         "gemma3:1b": "google/gemma-3-1b-it"
     }
-    tokenizer = AutoTokenizer.from_pretrained(original_model_map[model_name])
-    model = ORTModelForCausalLM.from_pretrained(
-        hf_model_name,
-        file_name="model_quantized.onnx",
-        use_cache=False
+    hf_model_name = model_map[model_name]
+
+    tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        hf_model_name,
+        torch_dtype=torch.bfloat16,
+        device_map="auto"
     )

     llm_cache[model_name] = (model, tokenizer)
@@ -99,20 +93,30 @@ def generate_text(model_name, system_prompt, user_prompt):
         {"role": "user", "content": user_prompt},
     ]

-    inputs = tokenizer.apply_chat_template(
+    chat_template_args = {
+        "tokenize": False,
+        "add_generation_prompt": True
+    }
+
+    if 'qwen' in model_name.lower():
+        chat_template_args['enable_thinking'] = False
+
+    text = tokenizer.apply_chat_template(
         messages,
-        add_generation_prompt=True,
-        return_tensors="pt",
+        **chat_template_args
     )

-    generated_ids = model.generate(inputs, max_new_tokens=512)
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+    generated_ids = model.generate(
+        model_inputs.input_ids,
+        max_new_tokens=512
+    )
+    generated_ids = [
+        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+    ]

     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-    # The response might include the prompt, so we remove it.
-    # This is a common pattern when decoding from a generation.
-    prompt_plus_response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-    response = prompt_plus_response[len(tokenizer.decode(inputs[0], skip_special_tokens=True)):]

     return response
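
For reference, the new load-and-generate path can be exercised end to end roughly as in the sketch below. This sketch is assembled from the hunks above and is not part of the commit itself: the checkpoint id is the one used in model_map in the diff, the prompts are placeholders, and device_map="auto" assumes whatever accelerator (or CPU) is available.

# Minimal standalone sketch of the transformers-based flow from this commit (not part of the diff).
# The checkpoint id below is copied from model_map in the diff; swap in "google/gemma-3-1b-it" as needed.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

hf_model_name = "Qwen/Qwen3-0.6B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
model = AutoModelForCausalLM.from_pretrained(
    hf_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Describe what YOLO object detection does."},
]

# Render the chat template to text first (tokenize=False), then tokenize separately,
# mirroring generate_text(); enable_thinking=False is the Qwen-specific switch used in the diff.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
# Strip the prompt tokens so only the newly generated completion is decoded.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)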
 
requirements.txt CHANGED
@@ -3,5 +3,3 @@ ultralytics
 torch
 transformers
 pillow
-bitsandbytes
-optimum[onnxruntime]