SorenDreano committed on
Commit
7f3fcea
·
verified ·
1 Parent(s): d92a654

Upload processor

Browse files
chat_template.json CHANGED
@@ -1,3 +1,3 @@
1
  {
2
- "chat_template": "{% set image_placeholder = '<|vision_start|><|image_pad|><|vision_end|>' %}\n{% for message in messages %}\n {#--- Handle User Messages with Template and Examples ---#}\n {%- if message['role'] == 'user' and template -%}\n {% if loop.first and message['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are NuExtract, an information extraction tool created by NuMind.<|im_end|>' }}\n {% endif %}\n \n {{- '<|im_start|>' + message['role'] -}}\n \n {#--- Template Section ---#}\n {{ '\n# Template:' }}\n {{- '\n' + template + '\n' }}\n \n {#--- Examples Section (if provided) ---#}\n {% if examples -%}\n {{- '# Examples:' }}\n {% for example in examples %}\n {{- '## Input:\n' }}\n {#--- Handle image examples ---#}\n {% if example['input'] is mapping and example['input']['type'] == 'image' %}\n {{- image_placeholder | trim -}}\n {% elif example['input'] == '<image>' %}\n {{- image_placeholder | trim -}}\n {% else %}\n {{- example['input'] -}}\n {% endif %}\n {{- '\n## Output:\n' ~ example['output'] }}\n {% endfor %}\n {%- endif %}\n \n {#--- Context Section: Handle various content types ---#}\n {{- '# Context:\n' }}\n {%- if message['content'] is string -%}\n {#--- Simple string content ---#}\n {{- message['content'] | trim -}}\n {%- elif message['content'] is mapping and message['content']['type'] == 'image' -%}\n {#--- Single image document ---#}\n {{- image_placeholder | trim -}}\n {%- else -%}\n {#--- List of content items (mixed text/images) ---#}\n {#--- First, determine what the actual input content is (not ICL images) ---#}\n {%- set ns = namespace(has_text_input=false, text_content='') -%}\n \n {#--- Count content types and identify actual input document ---#}\n {%- for content in message['content'] -%}\n {%- if content is mapping and content.get('type') == 'text' -%}\n {%- if content.get('text') != '<image>' -%}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content['text'] -%}\n {%- endif -%}\n {%- elif content is string 
-%}\n {%- if content != '<image>' -%}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content -%}\n {%- endif -%}\n {%- endif -%}\n {%- endfor -%}\n \n {#--- Determine what to output based on actual input type ---#}\n {%- if ns.has_text_input -%}\n {#--- Main input is text, so output the text content ---#}\n {{- ns.text_content | trim -}}\n {%- else -%}\n {#--- Main input is image or <image> placeholder ---#}\n {%- set ns2 = namespace(found_image=false) -%}\n {%- for content in message['content'] -%}\n {%- if content is mapping and content.get('type') == 'image' and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is string and content == '<image>' and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n {{- '<|im_end|>\n'}}\n \n {#--- Handle All Other Messages (Assistant, System, etc.) 
---#}\n {% else %}\n {% if loop.first and message['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>' }}\n {% endif %}\n \n {{- '<|im_start|>' + message['role'] + '\n' }}\n \n {#--- Same content handling logic as above but without template/examples ---#}\n {%- if message['content'] is string -%}\n {{- message['content'] | trim }}\n {%- elif message['content'] is mapping and message['content']['type'] == 'image' -%}\n {{- image_placeholder | trim }}\n {%- else -%}\n {%- for content in message['content'] -%}\n {%- if content is string -%}\n {{- content | trim -}}\n {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' -%}\n {{- image_placeholder | trim }}\n {%- elif content is mapping and content.get('type') == 'text' -%}\n {{- content['text'] | trim -}}\n {%- elif content is mapping and content.get('type') == 'image' -%}\n {# Skip adding image placeholder - it's already in the text #}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {{- '<|im_end|>'}}\n {% endif %}\n{% endfor -%}\n{#--- Add Generation Prompt if Requested ---#}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant' }}\n{% endif -%}"
3
  }
 
1
  {
2
+ "chat_template": "{%- set image_placeholder = '<|vision_start|><|image_pad|><|vision_end|>' -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'user' -%}\n {%- if loop.first and message['role'] != 'system' -%}\n {{- '<|im_start|>system\n' -}}\n {%- if template -%}\n {#--- If template, extraction task ---#}\n {{- 'You are NuExtract, an information extraction tool created by NuMind.' -}}\n {%- else -%}\n {#--- Else, template generation task ---#}\n {{- 'You are a helpful assistant.' -}}\n {%- endif -%}\n {{ '<|im_end|>\n' }}\n {%- endif -%}\n {{- '<|im_start|>' + message['role'] + '\n' -}}\n {%- if template -%}\n {#--- Template Section ---#}\n {{- '# Template:\n' -}}\n {{- template -}}\n {{- '\n' -}}\n \n {%- if examples -%}\n {#--- Examples can only exist in the extraction task ---#}\n {{- '# Examples:\n' -}}\n {%- for example in examples -%}\n {{- '## Input:\n' -}}\n {%- if example['input'] is mapping and (example['input']['type'] == 'image' or example['input']['type'] == 'image_url') -%}\n {{- image_placeholder | trim -}}\n {%- elif example['input'] == '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder | trim -}}\n {%- else -%}\n {#--- Text input example ---#}\n {{- example['input'] -}}\n {%- endif -%}\n {{- '\n' -}}\n {{- '## Output:\n' -}}\n {{- example['output'] -}}\n {{- '\n' -}}\n {%- endfor -%}\n {%- endif -%}\n {{- '# Context:\n' -}}\n {%- endif -%}\n \n {%- if message['content'] is string -%}\n {#--- Simple string content ---#}\n {{- message['content'] | trim -}}\n {%- elif message['content'] is mapping and (message['content']['type'] == 'image' or message['content']['type'] == 'image_url') -%}\n {{- image_placeholder | trim -}}\n {%- else -%}\n {#--- List of content items (mixed text/images) ---#}\n {#--- First, determine what the actual input content is (not ICL images) ---#}\n {%- set ns = namespace(has_text_input=false, text_content='') -%}\n \n {#--- Count content types and identify actual input 
document ---#}\n {%- for content in message['content'] -%}\n {%- if content is mapping and content.get('type') == 'text' -%}\n {%- if content.get('text') != '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content['text'] -%}\n {%- endif -%}\n {%- elif content is string -%}\n {%- if content != '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content -%}\n {%- endif -%}\n {%- endif -%}\n {%- endfor -%}\n \n {#--- Determine what to output based on actual input type ---#}\n {%- if ns.has_text_input -%}\n {#--- Main input is text, so output the text content ---#}\n {{- ns.text_content | trim -}}\n {%- else -%}\n {#--- Main input is image or <image> placeholder ---#}\n {%- set ns2 = namespace(found_image=false) -%}\n {%- for content in message['content'] -%}\n {%- if content is mapping and (content.get('type') == 'image' or content.get('type') == 'image_url') and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' and not ns2.found_image -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is string and content == '<image>' and not ns2.found_image -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n {{- '<|im_end|>\n'}}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{- '<|im_start|>assistant\n' -}}\n{%- endif -%}"
3
  }
preprocessor_config.json CHANGED
@@ -14,7 +14,7 @@
14
  0.26130258,
15
  0.27577711
16
  ],
17
- "max_pixels": 2352000,
18
  "merge_size": 2,
19
  "min_pixels": 200704,
20
  "patch_size": 14,
@@ -22,8 +22,8 @@
22
  "resample": 3,
23
  "rescale_factor": 0.00392156862745098,
24
  "size": {
25
- "longest_edge": 12845056,
26
- "shortest_edge": 3136
27
  },
28
  "temporal_patch_size": 2
29
  }
 
14
  0.26130258,
15
  0.27577711
16
  ],
17
+ "max_pixels": 23000000,
18
  "merge_size": 2,
19
  "min_pixels": 200704,
20
  "patch_size": 14,
 
22
  "resample": 3,
23
  "rescale_factor": 0.00392156862745098,
24
  "size": {
25
+ "longest_edge": 23000000,
26
+ "shortest_edge": 200704
27
  },
28
  "temporal_patch_size": 2
29
  }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
- size 11421896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0c439f7be467bf47d12a7e6f9adc6116201056fc60c67f431c679b7c16afc8
3
+ size 11422064
tokenizer_config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "add_bos_token": false,
3
  "add_prefix_space": false,
4
  "added_tokens_decoder": {
@@ -195,16 +196,19 @@
195
  "<|video_pad|>"
196
  ],
197
  "bos_token": null,
198
- "chat_template": "{% set image_placeholder = '<|vision_start|><|image_pad|><|vision_end|>' %}\n{% for message in messages %}\n {#--- Handle User Messages with Template and Examples ---#}\n {%- if message['role'] == 'user' and template -%}\n {% if loop.first and message['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are NuExtract, an information extraction tool created by NuMind.<|im_end|>' }}\n {% endif %}\n \n {{- '<|im_start|>' + message['role'] -}}\n \n {#--- Template Section ---#}\n {{ '\n# Template:' }}\n {{- '\n' + template + '\n' }}\n \n {#--- Examples Section (if provided) ---#}\n {% if examples -%}\n {{- '# Examples:' }}\n {% for example in examples %}\n {{- '## Input:\n' }}\n {#--- Handle image examples ---#}\n {% if example['input'] is mapping and example['input']['type'] == 'image' %}\n {{- image_placeholder | trim -}}\n {% elif example['input'] == '<image>' %}\n {{- image_placeholder | trim -}}\n {% else %}\n {{- example['input'] -}}\n {% endif %}\n {{- '\n## Output:\n' ~ example['output'] }}\n {% endfor %}\n {%- endif %}\n \n {#--- Context Section: Handle various content types ---#}\n {{- '# Context:\n' }}\n {%- if message['content'] is string -%}\n {#--- Simple string content ---#}\n {{- message['content'] | trim -}}\n {%- elif message['content'] is mapping and message['content']['type'] == 'image' -%}\n {#--- Single image document ---#}\n {{- image_placeholder | trim -}}\n {%- else -%}\n {#--- List of content items (mixed text/images) ---#}\n {#--- First, determine what the actual input content is (not ICL images) ---#}\n {%- set ns = namespace(has_text_input=false, text_content='') -%}\n \n {#--- Count content types and identify actual input document ---#}\n {%- for content in message['content'] -%}\n {%- if content is mapping and content.get('type') == 'text' -%}\n {%- if content.get('text') != '<image>' -%}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content['text'] -%}\n {%- endif -%}\n {%- elif content is string 
-%}\n {%- if content != '<image>' -%}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content -%}\n {%- endif -%}\n {%- endif -%}\n {%- endfor -%}\n \n {#--- Determine what to output based on actual input type ---#}\n {%- if ns.has_text_input -%}\n {#--- Main input is text, so output the text content ---#}\n {{- ns.text_content | trim -}}\n {%- else -%}\n {#--- Main input is image or <image> placeholder ---#}\n {%- set ns2 = namespace(found_image=false) -%}\n {%- for content in message['content'] -%}\n {%- if content is mapping and content.get('type') == 'image' and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is string and content == '<image>' and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n {{- '<|im_end|>\n'}}\n \n {#--- Handle All Other Messages (Assistant, System, etc.) 
---#}\n {% else %}\n {% if loop.first and message['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>' }}\n {% endif %}\n \n {{- '<|im_start|>' + message['role'] + '\n' }}\n \n {#--- Same content handling logic as above but without template/examples ---#}\n {%- if message['content'] is string -%}\n {{- message['content'] | trim }}\n {%- elif message['content'] is mapping and message['content']['type'] == 'image' -%}\n {{- image_placeholder | trim }}\n {%- else -%}\n {%- for content in message['content'] -%}\n {%- if content is string -%}\n {{- content | trim -}}\n {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' -%}\n {{- image_placeholder | trim }}\n {%- elif content is mapping and content.get('type') == 'text' -%}\n {{- content['text'] | trim -}}\n {%- elif content is mapping and content.get('type') == 'image' -%}\n {# Skip adding image placeholder - it's already in the text #}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {{- '<|im_end|>'}}\n {% endif %}\n{% endfor -%}\n{#--- Add Generation Prompt if Requested ---#}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant' }}\n{% endif -%}",
199
  "clean_up_tokenization_spaces": false,
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
- "max_pixels": 2352000,
 
204
  "min_pixels": 200704,
205
  "model_max_length": 131072,
 
206
  "pad_token": "<|endoftext|>",
207
- "padding_side": "left",
 
208
  "processor_class": "Qwen2_5_VLProcessor",
209
  "split_special_tokens": false,
210
  "tokenizer_class": "Qwen2Tokenizer",
 
1
  {
2
+ "_commit_hash": null,
3
  "add_bos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
 
196
  "<|video_pad|>"
197
  ],
198
  "bos_token": null,
199
+ "chat_template": "{%- set image_placeholder = '<|vision_start|><|image_pad|><|vision_end|>' -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'user' -%}\n {%- if loop.first and message['role'] != 'system' -%}\n {{- '<|im_start|>system\n' -}}\n {%- if template -%}\n {#--- If template, extraction task ---#}\n {{- 'You are NuExtract, an information extraction tool created by NuMind.' -}}\n {%- else -%}\n {#--- Else, template generation task ---#}\n {{- 'You are a helpful assistant.' -}}\n {%- endif -%}\n {{ '<|im_end|>\n' }}\n {%- endif -%}\n {{- '<|im_start|>' + message['role'] + '\n' -}}\n {%- if template -%}\n {#--- Template Section ---#}\n {{- '# Template:\n' -}}\n {{- template -}}\n {{- '\n' -}}\n \n {%- if examples -%}\n {#--- Examples can only exist in the extraction task ---#}\n {{- '# Examples:\n' -}}\n {%- for example in examples -%}\n {{- '## Input:\n' -}}\n {%- if example['input'] is mapping and (example['input']['type'] == 'image' or example['input']['type'] == 'image_url') -%}\n {{- image_placeholder | trim -}}\n {%- elif example['input'] == '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder | trim -}}\n {%- else -%}\n {#--- Text input example ---#}\n {{- example['input'] -}}\n {%- endif -%}\n {{- '\n' -}}\n {{- '## Output:\n' -}}\n {{- example['output'] -}}\n {{- '\n' -}}\n {%- endfor -%}\n {%- endif -%}\n {{- '# Context:\n' -}}\n {%- endif -%}\n \n {%- if message['content'] is string -%}\n {#--- Simple string content ---#}\n {{- message['content'] | trim -}}\n {%- elif message['content'] is mapping and (message['content']['type'] == 'image' or message['content']['type'] == 'image_url') -%}\n {{- image_placeholder | trim -}}\n {%- else -%}\n {#--- List of content items (mixed text/images) ---#}\n {#--- First, determine what the actual input content is (not ICL images) ---#}\n {%- set ns = namespace(has_text_input=false, text_content='') -%}\n \n {#--- Count content types and identify actual input 
document ---#}\n {%- for content in message['content'] -%}\n {%- if content is mapping and content.get('type') == 'text' -%}\n {%- if content.get('text') != '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content['text'] -%}\n {%- endif -%}\n {%- elif content is string -%}\n {%- if content != '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content -%}\n {%- endif -%}\n {%- endif -%}\n {%- endfor -%}\n \n {#--- Determine what to output based on actual input type ---#}\n {%- if ns.has_text_input -%}\n {#--- Main input is text, so output the text content ---#}\n {{- ns.text_content | trim -}}\n {%- else -%}\n {#--- Main input is image or <image> placeholder ---#}\n {%- set ns2 = namespace(found_image=false) -%}\n {%- for content in message['content'] -%}\n {%- if content is mapping and (content.get('type') == 'image' or content.get('type') == 'image_url') and not ns2.found_image -%}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' and not ns2.found_image -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is string and content == '<image>' and not ns2.found_image -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder | trim -}}\n {%- set ns2.found_image = true -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n {{- '<|im_end|>\n'}}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{- '<|im_start|>assistant\n' -}}\n{%- endif -%}",
200
  "clean_up_tokenization_spaces": false,
201
  "eos_token": "<|im_end|>",
202
  "errors": "replace",
203
  "extra_special_tokens": {},
204
+ "max_length": null,
205
+ "max_pixels": 23000000,
206
  "min_pixels": 200704,
207
  "model_max_length": 131072,
208
+ "pad_to_multiple_of": null,
209
  "pad_token": "<|endoftext|>",
210
+ "pad_token_type_id": 0,
211
+ "padding_side": "right",
212
  "processor_class": "Qwen2_5_VLProcessor",
213
  "split_special_tokens": false,
214
  "tokenizer_class": "Qwen2Tokenizer",