lentohaihane committed · Commit aedc0bd · verified · 1 Parent(s): 63584c6

Training in progress, step 500

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,58 @@
1
+ ---
2
+ base_model: YannQi/R-4B
3
+ library_name: transformers
4
+ model_name: r-4b-sft-eda-all
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for r-4b-sft-eda-all
13
+
14
+ This model is a fine-tuned version of [YannQi/R-4B](https://huggingface.co/YannQi/R-4B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="lentohaihane/r-4b-sft-eda-all", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.25.0.dev0
38
+ - Transformers: 4.57.0
39
+ - Pytorch: 2.8.0
40
+ - Datasets: 4.3.0
41
+ - Tokenizers: 0.22.1
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
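
The card above notes only that the model was trained with SFT on top of YannQi/R-4B using the framework versions listed. For orientation, here is a minimal, hypothetical sketch of such a run with TRL's `SFTTrainer`; the dataset, output directory, and save cadence are illustrative placeholders and are not read from this commit, while the LoRA settings mirror the adapter_config.json further down.

```python
# Hypothetical sketch of an SFT run that would produce a LoRA adapter like this one.
# Dataset, output_dir, and hyperparameters are placeholders, not values from this repo.
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

dataset = load_dataset("trl-lib/Capybara", split="train")  # placeholder dataset

peft_config = LoraConfig(
    r=16,                      # mirrors adapter_config.json below
    lora_alpha=128,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "embed_tokens"],
    task_type="CAUSAL_LM",
)

training_args = SFTConfig(
    output_dir="r-4b-sft-eda-all",
    save_steps=500,                                  # this commit is a step-500 checkpoint
    model_init_kwargs={"trust_remote_code": True},   # R-4B ships custom modeling code
)

trainer = SFTTrainer(
    model="YannQi/R-4B",
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
```

The actual training arguments live in training_args.bin further down and are not reproduced here.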
adapter_config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "YannQi/R-4B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 128,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "q_proj",
29
+ "k_proj",
30
+ "o_proj",
31
+ "v_proj",
32
+ "embed_tokens"
33
+ ],
34
+ "target_parameters": null,
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
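
The adapter_config.json above declares a LoRA adapter (r=16, lora_alpha=128, dropout 0.05 on the attention projections plus embed_tokens) rather than full fine-tuned weights, so the adapter_model.safetensors that follows has to be attached to the base model at load time. A minimal loading sketch with PEFT, assuming the base repo requires trust_remote_code:

```python
# Sketch: attach the LoRA adapter from this repo to the YannQi/R-4B base model.
# trust_remote_code is an assumption, made because R-4B ships custom modeling code.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    "YannQi/R-4B", torch_dtype=torch.bfloat16, trust_remote_code=True
)
model = PeftModel.from_pretrained(base, "lentohaihane/r-4b-sft-eda-all")
model.eval()
```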
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f66b02a88d4058ab03ba5693ae1778288c28e1500a51dd2f2644c4fd0e9345b4
3
+ size 846889208
added_tokens.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<image>": 151669,
6
+ "<think>": 151667,
7
+ "<tool_call>": 151657,
8
+ "<tool_response>": 151665,
9
+ "<video>": 151670,
10
+ "<|box_end|>": 151649,
11
+ "<|box_start|>": 151648,
12
+ "<|endoftext|>": 151643,
13
+ "<|file_sep|>": 151664,
14
+ "<|fim_middle|>": 151660,
15
+ "<|fim_pad|>": 151662,
16
+ "<|fim_prefix|>": 151659,
17
+ "<|fim_suffix|>": 151661,
18
+ "<|im_end|>": 151645,
19
+ "<|im_start|>": 151644,
20
+ "<|image_pad|>": 151655,
21
+ "<|object_ref_end|>": 151647,
22
+ "<|object_ref_start|>": 151646,
23
+ "<|quad_end|>": 151651,
24
+ "<|quad_start|>": 151650,
25
+ "<|repo_name|>": 151663,
26
+ "<|video_pad|>": 151656,
27
+ "<|vision_end|>": 151653,
28
+ "<|vision_pad|>": 151654,
29
+ "<|vision_start|>": 151652
30
+ }
chat_template.jinja ADDED
@@ -0,0 +1,11 @@
1
+ {% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>
3
+ ' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>
4
+ ' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>' + '
5
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
6
+ <think>' }}{% endif %}{%- if add_generation_prompt %}{%- if thinking_mode is defined and thinking_mode == 'short' %}{{- '
7
+
8
+ </think>
9
+
10
+ ' }}{%- endif %}{%- if thinking_mode is defined and thinking_mode == 'long' %}{{- '
11
+ ' }}{%- endif %}{%- endif %}
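
The chat_template.jinja above wraps each message in <|im_start|>/<|im_end|>, emits <image> and <video> placeholders before the message text, and opens the assistant turn with a <think> block; an optional thinking_mode variable ("short" closes the block immediately, "long" leaves it open for reasoning). A small rendering sketch, assuming message content is passed as typed parts the way the template expects:

```python
# Sketch: render the chat template above; extra kwargs such as thinking_mode are
# forwarded to the Jinja context by apply_chat_template.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("lentohaihane/r-4b-sft-eda-all")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image."},
        ],
    },
]
prompt = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, thinking_mode="short"
)
print(prompt)  # in "short" mode the generation prompt closes </think> immediately
```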
image_processing_r.py ADDED
@@ -0,0 +1,499 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from collections.abc import Iterable
17
+ from typing import Optional, Union
18
+
19
+ import numpy as np
20
+
21
+ from transformers.image_processing_utils import (
22
+ BaseImageProcessor,
23
+ BatchFeature,
24
+ get_patch_output_size,
25
+ get_size_dict,
26
+ select_best_resolution,
27
+ )
28
+ from transformers.image_transforms import (
29
+ PaddingMode,
30
+ convert_to_rgb,
31
+ pad,
32
+ resize,
33
+ to_channel_dimension_format,
34
+ )
35
+ from transformers.image_utils import (
36
+ OPENAI_CLIP_MEAN,
37
+ OPENAI_CLIP_STD,
38
+ ChannelDimension,
39
+ ImageInput,
40
+ PILImageResampling,
41
+ get_image_size,
42
+ infer_channel_dimension_format,
43
+ is_scaled_image,
44
+ make_flat_list_of_images,
45
+ to_numpy_array,
46
+ valid_images,
47
+ validate_preprocess_arguments,
48
+ )
49
+ from transformers.utils import TensorType, is_vision_available, logging
50
+
51
+
52
+ logger = logging.get_logger(__name__)
53
+
54
+
55
+ if is_vision_available():
56
+ from PIL import Image
57
+
58
+
59
+ # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches
60
+ def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> list[np.array]:
61
+ """
62
+ Divides an image into patches of a specified size.
63
+
64
+ Args:
65
+ image (`np.array`):
66
+ The input image.
67
+ patch_size (`int`):
68
+ The size of each patch.
69
+ input_data_format (`ChannelDimension` or `str`):
70
+ The channel dimension format of the input image.
71
+
72
+ Returns:
73
+ list: A list of np.array representing the patches.
74
+ """
75
+ patches = []
76
+ height, width = get_image_size(image, channel_dim=input_data_format)
77
+ for i in range(0, height, patch_size):
78
+ for j in range(0, width, patch_size):
79
+ if input_data_format == ChannelDimension.LAST:
80
+ patch = image[i : i + patch_size, j : j + patch_size]
81
+ else:
82
+ patch = image[:, i : i + patch_size, j : j + patch_size]
83
+ patches.append(patch)
84
+
85
+ return patches
86
+
87
+
88
+ # Copied from transformers.models.llava_next.image_processing_llava_next.expand_to_square
89
+ def expand_to_square(image: np.array, background_color, input_data_format) -> np.array:
90
+ """
91
+ Expands an image to a square by adding a background color.
92
+ """
93
+
94
+ height, width = get_image_size(image, channel_dim=input_data_format)
95
+ if width == height:
96
+ return image
97
+ elif width > height:
98
+ result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color
99
+ result[(width - height) // 2 : (width - height) // 2 + height, :] = image
100
+ return result
101
+ else:
102
+ result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color
103
+ result[:, (height - width) // 2 : (height - width) // 2 + width] = image
104
+ return result
105
+
106
+
107
+ class RImageProcessor(BaseImageProcessor):
108
+ model_input_names = ["pixel_values_videos"]
109
+
110
+ def __init__(
111
+ self,
112
+ do_resize: bool = True,
113
+ size: Optional[dict[str, int]] = None,
114
+ image_grid_pinpoints: Optional[list] = None,
115
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
116
+ do_rescale: bool = True,
117
+ rescale_factor: Union[int, float] = 1 / 255,
118
+ do_normalize: bool = True,
119
+ image_mean: Optional[Union[float, list[float]]] = None,
120
+ image_std: Optional[Union[float, list[float]]] = None,
121
+ do_pad: Optional[bool] = True,
122
+ do_convert_rgb: bool = True,
123
+ **kwargs,
124
+ ) -> None:
125
+ super().__init__(**kwargs)
126
+ size = size if size is not None else {"height": 384, "width": 384}
127
+ size = get_size_dict(size, default_to_square=False)
128
+ image_grid_pinpoints = (
129
+ image_grid_pinpoints
130
+ if image_grid_pinpoints is not None
131
+ else [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
132
+ )
133
+ self.do_resize = do_resize
134
+ self.size = size
135
+ self.image_grid_pinpoints = image_grid_pinpoints
136
+ self.resample = resample
137
+ self.do_rescale = do_rescale
138
+ self.rescale_factor = rescale_factor
139
+ self.do_normalize = do_normalize
140
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
141
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
142
+ self.do_pad = do_pad
143
+ self.do_convert_rgb = do_convert_rgb
144
+
145
+ # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.pad
146
+ def pad(
147
+ self,
148
+ image: np.ndarray,
149
+ padding: Union[int, tuple[int, int], Iterable[tuple[int, int]]],
150
+ mode: PaddingMode = PaddingMode.CONSTANT,
151
+ constant_values: Union[float, Iterable[float]] = 0.0,
152
+ data_format: Optional[Union[str, ChannelDimension]] = None,
153
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
154
+ ) -> np.ndarray:
155
+
156
+ # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim
157
+ if isinstance(padding, int) or len(padding) != 4:
158
+ return pad(image, padding, mode, constant_values, data_format, input_data_format)
159
+
160
+ if input_data_format is None:
161
+ input_data_format = infer_channel_dimension_format(image)
162
+ if mode == PaddingMode.CONSTANT:
163
+ image = np.pad(image, padding, mode="constant", constant_values=constant_values)
164
+ elif mode == PaddingMode.REFLECT:
165
+ image = np.pad(image, padding, mode="reflect")
166
+ elif mode == PaddingMode.REPLICATE:
167
+ image = np.pad(image, padding, mode="edge")
168
+ elif mode == PaddingMode.SYMMETRIC:
169
+ image = np.pad(image, padding, mode="symmetric")
170
+ else:
171
+ raise ValueError(f"Invalid padding mode: {mode}")
172
+ image = (
173
+ to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
174
+ )
175
+ return image
176
+
177
+ # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._resize_for_patching
178
+ def _resize_for_patching(
179
+ self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
180
+ ) -> np.array:
181
+ new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
182
+
183
+ # Resize the image
184
+ resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
185
+
186
+ return resized_image
187
+
188
+ # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._get_padding_size
189
+ def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
190
+ original_height, original_width = original_resolution
191
+ target_height, target_width = target_resolution
192
+ paste_x, r_x = divmod(target_width - original_width, 2)
193
+ paste_y, r_y = divmod(target_height - original_height, 2)
194
+ return (paste_y, paste_y + r_y), (paste_x, paste_x + r_x)
195
+
196
+ # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_patching
197
+ def _pad_for_patching(
198
+ self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
199
+ ) -> np.array:
200
+ """
201
+ Pad an image to a target resolution while maintaining aspect ratio.
202
+ """
203
+ new_resolution = get_patch_output_size(image, target_resolution, input_data_format)
204
+ padding = self._get_padding_size(new_resolution, target_resolution)
205
+
206
+ padded_image = self.pad(image, padding=padding)
207
+
208
+ return padded_image
209
+
210
+ # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.get_image_patches
211
+ def get_image_patches(
212
+ self,
213
+ image: np.array,
214
+ grid_pinpoints,
215
+ size: tuple,
216
+ patch_size: int,
217
+ resample: PILImageResampling,
218
+ data_format: ChannelDimension,
219
+ input_data_format: ChannelDimension,
220
+ ) -> list[np.array]:
221
+ if not isinstance(grid_pinpoints, list):
222
+ raise TypeError("grid_pinpoints must be a list of possible resolutions.")
223
+
224
+ possible_resolutions = grid_pinpoints
225
+
226
+ image_size = get_image_size(image, channel_dim=input_data_format)
227
+ best_resolution = select_best_resolution(image_size, possible_resolutions)
228
+ resized_image = self._resize_for_patching(
229
+ image, best_resolution, resample=resample, input_data_format=input_data_format
230
+ )
231
+ padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)
232
+
233
+ patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format)
234
+
235
+ # make sure that all patches are in the input data format
236
+ patches = [
237
+ to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format)
238
+ for patch in patches
239
+ ]
240
+
241
+ resized_original_image = resize(
242
+ image,
243
+ size=size,
244
+ resample=resample,
245
+ data_format=data_format,
246
+ input_data_format=input_data_format,
247
+ )
248
+
249
+ image_patches = [resized_original_image] + patches
250
+
251
+ return image_patches
252
+
253
+ # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_batching
254
+ def _pad_for_batching(
255
+ self,
256
+ pixel_values: list[np.ndarray],
257
+ data_format: Optional[Union[str, ChannelDimension]] = None,
258
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
259
+ ):
260
+ max_patch = max(len(x) for x in pixel_values)
261
+ pixel_values = [
262
+ self.pad(
263
+ image,
264
+ padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0), (0, 0)),
265
+ data_format=data_format,
266
+ input_data_format=input_data_format,
267
+ )
268
+ for image in pixel_values
269
+ ]
270
+
271
+ return pixel_values
272
+
273
+ # Copied from transformers.models.llava.image_processing_llava.LlavaImageProcessor.pad_to_square
274
+ def pad_to_square(
275
+ self,
276
+ image: np.ndarray,
277
+ background_color: Union[int, tuple[int, int, int]] = 0,
278
+ data_format: Optional[Union[str, ChannelDimension]] = None,
279
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
280
+ ) -> np.array:
281
+ height, width = get_image_size(image, input_data_format)
282
+ num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]
283
+
284
+ if height == width:
285
+ image = (
286
+ to_channel_dimension_format(image, data_format, input_data_format)
287
+ if data_format is not None
288
+ else image
289
+ )
290
+ return image
291
+
292
+ max_dim = max(height, width)
293
+
294
+ # Ensure background_color is the correct shape
295
+ if isinstance(background_color, int):
296
+ background_color = [background_color]
297
+ elif len(background_color) != num_channels:
298
+ raise ValueError(
299
+ f"background_color must have no more than {num_channels} elements to match the number of channels"
300
+ )
301
+
302
+ if input_data_format == ChannelDimension.FIRST:
303
+ result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
304
+ for i, color in enumerate(background_color):
305
+ result[i, :, :] = color
306
+ if width > height:
307
+ start = (max_dim - height) // 2
308
+ result[:, start : start + height, :] = image
309
+ else:
310
+ start = (max_dim - width) // 2
311
+ result[:, :, start : start + width] = image
312
+ else:
313
+ result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
314
+ for i, color in enumerate(background_color):
315
+ result[:, :, i] = color
316
+ if width > height:
317
+ start = (max_dim - height) // 2
318
+ result[start : start + height, :, :] = image
319
+ else:
320
+ start = (max_dim - width) // 2
321
+ result[:, start : start + width, :] = image
322
+
323
+ image = (
324
+ to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result
325
+ )
326
+ return image
327
+
328
+ def _preprocess(
329
+ self,
330
+ images: ImageInput,
331
+ do_resize: Optional[bool] = None,
332
+ size: Optional[dict[str, int]] = None,
333
+ resample: PILImageResampling = None,
334
+ do_rescale: Optional[bool] = None,
335
+ rescale_factor: Optional[float] = None,
336
+ do_normalize: Optional[bool] = None,
337
+ image_mean: Optional[Union[float, list[float]]] = None,
338
+ image_std: Optional[Union[float, list[float]]] = None,
339
+ do_convert_rgb: Optional[bool] = None,
340
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
341
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
342
+ ) -> Image.Image:
343
+ if do_resize:
344
+ images = [
345
+ resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
346
+ for image in images
347
+ ]
348
+
349
+ if do_rescale:
350
+ images = [
351
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
352
+ for image in images
353
+ ]
354
+
355
+ if do_normalize:
356
+ images = [
357
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
358
+ for image in images
359
+ ]
360
+
361
+ images = [
362
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
363
+ ]
364
+
365
+ return images
366
+
367
+ def preprocess(
368
+ self,
369
+ images: ImageInput,
370
+ do_resize: Optional[bool] = None,
371
+ size: Optional[dict[str, int]] = None,
372
+ image_grid_pinpoints: Optional[list] = None,
373
+ resample: PILImageResampling = None,
374
+ do_rescale: Optional[bool] = None,
375
+ rescale_factor: Optional[float] = None,
376
+ do_normalize: Optional[bool] = None,
377
+ image_mean: Optional[Union[float, list[float]]] = None,
378
+ image_std: Optional[Union[float, list[float]]] = None,
379
+ do_pad: Optional[bool] = None,
380
+ do_convert_rgb: Optional[bool] = None,
381
+ return_tensors: Optional[Union[str, TensorType]] = None,
382
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
383
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
384
+ ):
385
+ do_resize = do_resize if do_resize is not None else self.do_resize
386
+ size = size if size is not None else self.size
387
+ size = get_size_dict(size, default_to_square=False)
388
+ image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
389
+ resample = resample if resample is not None else self.resample
390
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
391
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
392
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
393
+ image_mean = image_mean if image_mean is not None else self.image_mean
394
+ image_std = image_std if image_std is not None else self.image_std
395
+ do_pad = do_pad if do_pad is not None else self.do_pad
396
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
397
+
398
+ if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
399
+ # if the first element is a list, we assume that all elements are lists
400
+ batch_num_images = [len(x) for x in images]
401
+ elif isinstance(images, (tuple, list)):
402
+ # treat this as a single-image case for backward compatibility
403
+ batch_num_images = [1] * len(images)
404
+ else:
405
+ batch_num_images = [1]
406
+ # only single image patching is supported
407
+ need_patching = [n == 1 for n in batch_num_images for _ in range(n)]
408
+
409
+ images = make_flat_list_of_images(images)
410
+
411
+ if not valid_images(images):
412
+ raise ValueError(
413
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
414
+ "torch.Tensor, tf.Tensor or jax.ndarray."
415
+ )
416
+
417
+ validate_preprocess_arguments(
418
+ do_rescale=do_rescale,
419
+ rescale_factor=rescale_factor,
420
+ do_normalize=do_normalize,
421
+ image_mean=image_mean,
422
+ image_std=image_std,
423
+ do_resize=do_resize,
424
+ size=size,
425
+ resample=resample,
426
+ )
427
+
428
+ if do_convert_rgb:
429
+ images = [convert_to_rgb(image) for image in images]
430
+
431
+ # All transformations expect numpy arrays.
432
+ images = [to_numpy_array(image) for image in images]
433
+
434
+ if do_rescale and is_scaled_image(images[0]):
435
+ logger.warning_once(
436
+ "It looks like you are trying to rescale already rescaled images. If the input"
437
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
438
+ )
439
+
440
+ if input_data_format is None:
441
+ # We assume that all images have the same channel dimension format.
442
+ input_data_format = infer_channel_dimension_format(images[0])
443
+
444
+ size_tuple = (
445
+ (size["height"], size["width"])
446
+ if "height" in size and "width" in size
447
+ else (size["shortest_edge"], size["shortest_edge"])
448
+ )
449
+
450
+ new_images = []
451
+ image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
452
+ for i, image in enumerate(images):
453
+ if need_patching[i]:
454
+ # convert image into a list of patches
455
+ # we intentionally use the same data format as the input data format
456
+ image_patches = self.get_image_patches(
457
+ image,
458
+ image_grid_pinpoints,
459
+ size=size_tuple,
460
+ patch_size=size_tuple[0],
461
+ resample=resample,
462
+ data_format=input_data_format,
463
+ input_data_format=input_data_format,
464
+ )
465
+ else:
466
+ padded_image = self.pad_to_square(
467
+ image=image,
468
+ background_color=tuple(int(x * 255) for x in self.image_mean),
469
+ input_data_format=input_data_format,
470
+ )
471
+ image_patches = [padded_image]
472
+
473
+ # preprocess patches
474
+ pixel_values = self._preprocess(
475
+ image_patches,
476
+ do_resize=do_resize,
477
+ size=size_tuple,
478
+ resample=resample,
479
+ do_rescale=do_rescale,
480
+ rescale_factor=rescale_factor,
481
+ do_normalize=do_normalize,
482
+ image_mean=image_mean,
483
+ image_std=image_std,
484
+ data_format=data_format,
485
+ input_data_format=input_data_format,
486
+ )
487
+ pixel_values = np.array(pixel_values)
488
+ new_images.append(pixel_values)
489
+
490
+ if do_pad:
491
+ processed_images = self._pad_for_batching(new_images)
492
+
493
+ return BatchFeature(
494
+ data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
495
+ tensor_type=return_tensors,
496
+ )
497
+
498
+
499
+ __all__ = ["RImageProcessor"]
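
RImageProcessor above follows the LLaVA-NeXT "anyres" recipe: a single image is matched against the image_grid_pinpoints, resized and padded to the best resolution, cut into 384x384 patches, and stacked together with a resized copy of the full image, while images belonging to multi-image samples are only padded to a square. A usage sketch through the auto_map entry registered in preprocessor_config.json below (the input image here is synthetic):

```python
# Sketch: run the custom anyres image processor registered for this repo.
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained(
    "lentohaihane/r-4b-sft-eda-all", trust_remote_code=True
)
image = Image.fromarray(np.random.randint(0, 255, (600, 1000, 3), dtype=np.uint8))
batch = processor(image, return_tensors="np")
print(batch["pixel_values"].shape)  # (1, 1 + num_grid_patches, 3, 384, 384)
print(batch["image_sizes"])         # original (height, width) of each input image
```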
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_r.RImageProcessor",
4
+ "AutoProcessor": "processing_r.RProcessor"
5
+ },
6
+ "do_convert_rgb": null,
7
+ "do_normalize": true,
8
+ "do_pad": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_grid_pinpoints": [
12
+ [
13
+ 384,
14
+ 768
15
+ ],
16
+ [
17
+ 768,
18
+ 384
19
+ ],
20
+ [
21
+ 768,
22
+ 768
23
+ ],
24
+ [
25
+ 1152,
26
+ 384
27
+ ],
28
+ [
29
+ 384,
30
+ 1152
31
+ ]
32
+ ],
33
+ "image_mean": [
34
+ 0.5,
35
+ 0.5,
36
+ 0.5
37
+ ],
38
+ "image_processor_type": "RImageProcessor",
39
+ "image_std": [
40
+ 0.5,
41
+ 0.5,
42
+ 0.5
43
+ ],
44
+ "processor_class": "RProcessor",
45
+ "resample": 2,
46
+ "rescale_factor": 0.00392156862745098,
47
+ "size": {
48
+ "height": 384,
49
+ "width": 384
50
+ }
51
+ }
processing_r.py ADDED
@@ -0,0 +1,259 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+
14
+ import math
15
+ from collections.abc import Iterable
16
+ from typing import Union
17
+
18
+ import numpy as np
19
+
20
+ from transformers.feature_extraction_utils import BatchFeature
21
+ from transformers.image_processing_utils import select_best_resolution
22
+ from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
23
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, MultiModalData
24
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
25
+ from transformers.utils import logging
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ class RProcessorKwargs(ProcessingKwargs, total=False):
32
+ # see processing_utils.ProcessingKwargs documentation for usage.
33
+ _defaults = {
34
+ "text_kwargs": {
35
+ "padding": False,
36
+
37
+ },
38
+ "image_kwargs": {},
39
+ }
40
+
41
+
42
+ class RProcessor(ProcessorMixin):
43
+ attributes = ["image_processor", "tokenizer"]
44
+ valid_kwargs = [
45
+ "chat_template",
46
+ "num_image_tokens",
47
+ "image_processor_type",
48
+ "vision_feature_select_strategy",
49
+ "image_token",
50
+ "vision_aspect_ratio",
51
+ ]
52
+ image_processor_class = "AutoImageProcessor"
53
+ tokenizer_class = "AutoTokenizer"
54
+
55
+ def __init__(
56
+ self,
57
+ image_processor=None,
58
+ tokenizer=None,
59
+ num_image_tokens=None,
60
+ vision_feature_select_strategy=None,
61
+ chat_template=None,
62
+ image_token="<image>",
63
+ vision_aspect_ratio= "anyres",
64
+ **kwargs,
65
+ ):
66
+ self.num_image_tokens = num_image_tokens
67
+ self.vision_feature_select_strategy = vision_feature_select_strategy
68
+ self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
69
+ self.image_token_id = (
70
+ tokenizer.image_token_id
71
+ if getattr(tokenizer, "image_token_id", None)
72
+ else tokenizer.convert_tokens_to_ids(self.image_token)
73
+ )
74
+ self.vision_aspect_ratio = vision_aspect_ratio
75
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
76
+
77
+ def __call__(
78
+ self,
79
+ images: ImageInput = None,
80
+ text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
81
+ audio=None,
82
+ **kwargs: Unpack[RProcessorKwargs],
83
+ ) -> BatchFeature:
84
+ output_kwargs = self._merge_kwargs(
85
+ RProcessorKwargs,
86
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
87
+ **kwargs,
88
+ )
89
+
90
+ if isinstance(text, str):
91
+ text = [text]
92
+ elif not isinstance(text, list) and not isinstance(text[0], str):
93
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
94
+
95
+ image_inputs = {}
96
+
97
+ if images is not None:
98
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
99
+
100
+ batch_num_images = iter(image_inputs["batch_num_images"])
101
+ image_sizes = iter(image_inputs["image_sizes"])
102
+ height, width = get_image_size(
103
+ to_numpy_array(image_inputs["pixel_values"][0][0]),
104
+ channel_dim=output_kwargs["images_kwargs"].get("data_format"),
105
+ )
106
+ text, num_image_tokens = self._expand_image_tokens(
107
+ text, image_sizes, height, width, self.image_token, batch_num_images
108
+ )
109
+
110
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
111
+
112
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
113
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
114
+
115
+
116
+ return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
117
+
118
+ def _expand_image_tokens(
119
+ self,
120
+ text: list[TextInput],
121
+ image_sizes: Iterable[Union[list[int], int]],
122
+ height: int,
123
+ width: int,
124
+ special_token: str,
125
+ batch_num_images: Iterable[int],
126
+ ):
127
+
128
+ prompt_strings = []
129
+ max_num_vision_tokens = 0
130
+ for sample in text:
131
+ if special_token in sample:
132
+ is_multi_image = next(batch_num_images) != 1
133
+ else:
134
+ is_multi_image = False
135
+ while special_token in sample:
136
+ if is_multi_image:
137
+ num_image_tokens = self.num_image_tokens + 1 # one for image_newline
138
+ else:
139
+ original_size = next(image_sizes)
140
+ if not isinstance(original_size, (list, tuple)):
141
+ # cast to list to avoid numerical precision errors when calculating unpadding
142
+ original_size = original_size.tolist()
143
+ orig_height, orig_width = original_size
144
+ num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
145
+ max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens)
146
+ if self.vision_feature_select_strategy == "default":
147
+ num_image_tokens -= 1
148
+ sample = sample.replace(special_token, "<placeholder>" * num_image_tokens, 1)
149
+ prompt_strings.append(sample)
150
+ text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
151
+ return text, max_num_vision_tokens
152
+
153
+ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
154
+ image_grid_pinpoints = self.image_processor.image_grid_pinpoints
155
+
156
+ height_best_resolution, width_best_resolution = select_best_resolution(
157
+ [orig_height, orig_width], image_grid_pinpoints
158
+ )
159
+ scale_height, scale_width = height_best_resolution // height, width_best_resolution // width
160
+
161
+ patches_height = patches_width = int(math.sqrt(self.num_image_tokens))
162
+ unpadded_features, newline_features = self._get_unpadded_features(
163
+ orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
164
+ )
165
+
166
+ # The base patch covers the entire image (no CLS for SigLIP)
167
+ base_features = self.num_image_tokens
168
+ num_image_tokens = unpadded_features + newline_features + base_features
169
+ return num_image_tokens
170
+
171
+ # Adapted from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_unpadded_features
172
+ def _get_unpadded_features(self, height, width, patches_height, patches_width, scale_height, scale_width):
173
+ current_height = patches_height * scale_height
174
+ current_width = patches_width * scale_width
175
+
176
+ original_aspect_ratio = width / height
177
+ current_aspect_ratio = current_width / current_height
178
+ if original_aspect_ratio > current_aspect_ratio:
179
+ new_height = int(round(height * (current_width / width), 7))
180
+ padding = (current_height - new_height) // 2
181
+ current_height -= padding * 2
182
+ else:
183
+ new_width = int(round(width * (current_height / height), 7))
184
+ padding = (current_width - new_width) // 2
185
+ current_width -= padding * 2
186
+
187
+ unpadded_features = current_height * current_width
188
+ newline_features = current_height
189
+
190
+ return (unpadded_features, newline_features)
191
+
192
+
193
+ def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
194
+ """
195
+ Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
196
+ Args:
197
+ image_sizes (list[list[str]], *optional*):
198
+ The input sizes formatted as (height, width) per each image.
199
+ video_sizes (list[list[str]], *optional*):
200
+ The input sizes formatted as (num_frames, height, width) per each video.
201
+ audio_lengths (list[int], *optional*):
202
+ The input length formatted as per each audio.
203
+ Returns:
204
+ dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
205
+ to a list containing the number of placeholder tokens required. If the model doesn't accept
206
+ a certain modality or no input sizes are provided, the dict value is set to an empty list.
207
+ """
208
+ vision_data = {}
209
+ if image_sizes is not None:
210
+ images_kwargs = RProcessorKwargs._defaults.get("images_kwargs", {})
211
+ images_kwargs.update(kwargs)
212
+
213
+ size = images_kwargs.get("size", None) or self.image_processor.size
214
+ size = (
215
+ (size["shortest_edge"], size["shortest_edge"])
216
+ if "shortest_edge" in size
217
+ else (min(size["height"], size["width"]), min(size["height"], size["width"]))
218
+ )
219
+ processed_height, processed_width = size
220
+
221
+ batch_num_image_tokens = []
222
+ num_image_patches = [1] * len(image_sizes) # llava-ov doesn't batch pixels as Idefics, thus `1` patch`
223
+ for image_size in image_sizes:
224
+ orig_height, orig_width = image_size
225
+ num_image_tokens = self._get_number_of_features(
226
+ orig_height, orig_width, processed_height, processed_width
227
+ )
228
+ if self.vision_feature_select_strategy == "default":
229
+ num_image_tokens -= 1
230
+ batch_num_image_tokens.append(num_image_tokens)
231
+ vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})
232
+
233
+ return MultiModalData(**vision_data)
234
+
235
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
236
+ def batch_decode(self, *args, **kwargs):
237
+ """
238
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
239
+ refer to the docstring of this method for more information.
240
+ """
241
+ return self.tokenizer.batch_decode(*args, **kwargs)
242
+
243
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
244
+ def decode(self, *args, **kwargs):
245
+ """
246
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
247
+ the docstring of this method for more information.
248
+ """
249
+ return self.tokenizer.decode(*args, **kwargs)
250
+
251
+ @property
252
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
253
+ def model_input_names(self):
254
+ tokenizer_input_names = self.tokenizer.model_input_names
255
+ image_processor_input_names = self.image_processor.model_input_names
256
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
257
+
258
+
259
+ __all__ = ["RProcessor"]
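
RProcessor above ties the tokenizer and the image processor together: every <image> marker in the prompt is expanded into the number of vision tokens implied by the selected anyres resolution (computed in _get_number_of_features from num_image_tokens), and the expanded text is then tokenized alongside the pixel values. An end-to-end input-preparation sketch via AutoProcessor; the prompt and image are illustrative only:

```python
# Sketch: build model inputs with the custom processor from this repo.
# num_image_tokens=729 and the "<image>" marker come from processor_config.json below.
import numpy as np
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "lentohaihane/r-4b-sft-eda-all", trust_remote_code=True
)
image = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))
inputs = processor(images=image, text="<image>\nWhat is shown here?", return_tensors="pt")
print(inputs["input_ids"].shape)     # the "<image>" marker expanded to many image tokens
print(inputs["pixel_values"].shape)  # (1, 1 + num_grid_patches, 3, 384, 384)
```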
processor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_r.RProcessor"
4
+ },
5
+ "image_token": "<image>",
6
+ "num_image_tokens": 729,
7
+ "processor_class": "RProcessor",
8
+ "vision_aspect_ratio": "anyres",
9
+ "vision_feature_select_strategy": "full"
10
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44bdeb203b51e0bc7615c0e920f5278ebdac0a51146a9177a3d428030305168
3
+ size 11423190
tokenizer_config.json ADDED
@@ -0,0 +1,259 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<image>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<video>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ }
229
+ },
230
+ "additional_special_tokens": [
231
+ "<|im_start|>",
232
+ "<|im_end|>",
233
+ "<|object_ref_start|>",
234
+ "<|object_ref_end|>",
235
+ "<|box_start|>",
236
+ "<|box_end|>",
237
+ "<|quad_start|>",
238
+ "<|quad_end|>",
239
+ "<|vision_start|>",
240
+ "<|vision_end|>",
241
+ "<|vision_pad|>",
242
+ "<|image_pad|>",
243
+ "<|video_pad|>"
244
+ ],
245
+ "auto_map": {
246
+ "AutoProcessor": "processing_r.RProcessor"
247
+ },
248
+ "bos_token": null,
249
+ "clean_up_tokenization_spaces": false,
250
+ "eos_token": "<|im_end|>",
251
+ "errors": "replace",
252
+ "extra_special_tokens": {},
253
+ "model_max_length": 131072,
254
+ "pad_token": "<|endoftext|>",
255
+ "processor_class": "RProcessor",
256
+ "split_special_tokens": false,
257
+ "tokenizer_class": "Qwen2Tokenizer",
258
+ "unk_token": null
259
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:770c24dd83e15987930731f2a96d0bb2e4c94209023b8a6aaddde3b735cf2289
3
+ size 6289
vocab.json ADDED
The diff for this file is too large to render. See raw diff