Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

.gitattributes +1 -0
README.md +187 -0
chat_template.jinja +122 -0
config.json +54 -0
generation_config.json +7 -0
model.safetensors +3 -0
preprocessor_config.json +27 -0
processor_config.json +8 -0
special_tokens_map.json +30 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,187 @@

+---
+library_name: transformers
+pipeline_tag: image-text-to-text
+inference: true
+widget:
+  - text: Hello!
+    example_title: Hello world
+    group: Python
+base_model:
+- ServiceNow-AI/Apriel-1.5-15b-Thinker
+---
+This tiny model is intended for debugging only. Its weights are randomly initialized, and its configuration is a scaled-down adaptation of [ServiceNow-AI/Apriel-1.5-15b-Thinker](https://huggingface.co/ServiceNow-AI/Apriel-1.5-15b-Thinker).
+### Example usage:
+```python
+import re
+import requests
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForImageTextToText
+# Load model
+model_id = "tiny-random/apriel-1.5"
+model = AutoModelForImageTextToText.from_pretrained(
+    model_id,
+    dtype=torch.bfloat16,
+    device_map="auto"
+)
+processor = AutoProcessor.from_pretrained(model_id)
+url = "https://picsum.photos/id/237/200/300"
+image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+chat = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Which animal is this?"},
+            {"type": "image"},
+        ],
+    }
+]
+prompt = processor.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
+inputs = processor(text=prompt, images=[image], return_tensors="pt").to(model.device)
+inputs.pop("token_type_ids", None)
+inputs['pixel_values'] = inputs['pixel_values'].to(model.dtype)
+with torch.no_grad():
+    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.6)
+generated_ids = output_ids[:, inputs['input_ids'].shape[1]:]
+output = processor.decode(generated_ids[0], skip_special_tokens=False)
+print("Image Response:", output)
+```
+### Code to create this repo:
+```python
+import json
+from pathlib import Path
+import accelerate
+import torch
+from huggingface_hub import file_exists, hf_hub_download
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoProcessor,
+    GenerationConfig,
+    AutoModelForImageTextToText,
+    set_seed,
+)
+source_model_id = "ServiceNow-AI/Apriel-1.5-15b-Thinker"
+save_folder = "/tmp/tiny-random/apriel-1.5"
+processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
+processor.save_pretrained(save_folder)
+with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
+    config_json = json.load(f)
+config_json['text_config'].update({
+    'head_dim': 32,
+    'hidden_size': 8,
+    'intermediate_size': 64,
+    'num_hidden_layers': 2,
+    'num_attention_heads': 8,
+    'num_key_value_heads': 4,
+})
+config_json['vision_config'].update(
+    {
+        'head_dim': 32,
+        'intermediate_size': 256,
+        'hidden_size': 32 * 4,
+        'num_attention_heads': 4,
+        'num_hidden_layers': 2,
+    }
+)
+with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
+    json.dump(config_json, f, indent=2)
+config = AutoConfig.from_pretrained(
+    save_folder,
+    trust_remote_code=True,
+)
+print(config)
+torch.set_default_dtype(torch.bfloat16)
+model = AutoModelForImageTextToText.from_config(config, trust_remote_code=True).to(torch.bfloat16)
+torch.set_default_dtype(torch.float32)
+if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
+    model.generation_config = GenerationConfig.from_pretrained(
+        source_model_id, trust_remote_code=True,
+    )
+    model.generation_config.do_sample = True
+    print(model.generation_config)
+model = model.cpu()
+with torch.no_grad():
+    for name, p in sorted(model.named_parameters()):
+        torch.nn.init.normal_(p, 0, 0.1)
+        print(name, p.shape)
+model.save_pretrained(save_folder)
+```
+### Printing the model:
+```text
+LlavaForConditionalGeneration(
+  (model): LlavaModel(
+    (vision_tower): PixtralVisionModel(
+      (patch_conv): Conv2d(3, 128, kernel_size=(16, 16), stride=(16, 16), bias=False)
+      (ln_pre): PixtralRMSNorm((128,), eps=1e-05)
+      (transformer): PixtralTransformer(
+        (layers): ModuleList(
+          (0-1): 2 x PixtralAttentionLayer(
+            (attention_norm): PixtralRMSNorm((128,), eps=1e-05)
+            (feed_forward): PixtralMLP(
+              (gate_proj): Linear(in_features=128, out_features=256, bias=False)
+              (up_proj): Linear(in_features=128, out_features=256, bias=False)
+              (down_proj): Linear(in_features=256, out_features=128, bias=False)
+              (act_fn): SiLU()
+            )
+            (attention): PixtralAttention(
+              (k_proj): Linear(in_features=128, out_features=128, bias=False)
+              (v_proj): Linear(in_features=128, out_features=128, bias=False)
+              (q_proj): Linear(in_features=128, out_features=128, bias=False)
+              (o_proj): Linear(in_features=128, out_features=128, bias=False)
+            )
+            (ffn_norm): PixtralRMSNorm((128,), eps=1e-05)
+          )
+        )
+      )
+      (patch_positional_embedding): PixtralRotaryEmbedding()
+    )
+    (multi_modal_projector): LlavaMultiModalProjector(
+      (linear_1): Linear(in_features=128, out_features=8, bias=True)
+      (act): GELUActivation()
+      (linear_2): Linear(in_features=8, out_features=8, bias=True)
+    )
+    (language_model): MistralModel(
+      (embed_tokens): Embedding(131072, 8)
+      (layers): ModuleList(
+        (0-1): 2 x MistralDecoderLayer(
+          (self_attn): MistralAttention(
+            (q_proj): Linear(in_features=8, out_features=256, bias=False)
+            (k_proj): Linear(in_features=8, out_features=128, bias=False)
+            (v_proj): Linear(in_features=8, out_features=128, bias=False)
+            (o_proj): Linear(in_features=256, out_features=8, bias=False)
+          )
+          (mlp): MistralMLP(
+            (gate_proj): Linear(in_features=8, out_features=64, bias=False)
+            (up_proj): Linear(in_features=8, out_features=64, bias=False)
+            (down_proj): Linear(in_features=64, out_features=8, bias=False)
+            (act_fn): SiLU()
+          )
+          (input_layernorm): MistralRMSNorm((8,), eps=1e-05)
+          (post_attention_layernorm): MistralRMSNorm((8,), eps=1e-05)
+        )
+      )
+      (norm): MistralRMSNorm((8,), eps=1e-05)
+      (rotary_emb): MistralRotaryEmbedding()
+    )
+  )
+  (lm_head): Linear(in_features=8, out_features=131072, bias=False)
+)
+```

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,122 @@

+{#- Preamble: set defaults, build the optional tool and thought instruction strings, then emit bos_token and the default system block(s). -#}
+{#- All comments use whitespace-stripping delimiters and sit next to tags that already strip whitespace, so they do not change the rendered output. -#}
+{%- set available_tools_string, thought_instructions, add_tool_id, tool_output_format = '', '', true, "default" -%}
+{%- if tools is not none and tools|length > 0 -%}
+    {%- set available_tools_string -%}
+You are provided with function signatures within <available_tools></available_tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about the arguments. You should infer the argument values from previous user responses and the system message. Here are the available tools:
+<available_tools>
+{% for tool in tools %}
+{{ tool|string }}
+{% endfor %}
+</available_tools>
+{%- endset -%}
+{%- endif -%}
+{#- tool_output_format is assigned "default" on the first line of this template, so the yaml branch below is only taken if that assignment is changed. -#}
+{#- NOTE(review): the instruction text names the tag tool_call while the example and the assistant branch use tool_calls; presumably inherited from the source model's template, confirm before changing. -#}
+{%- if tool_output_format is none or tool_output_format == "default" -%}
+{%- set tool_output_instructions -%}
+Return all function calls as a list of json objects within <tool_call></tool_call> XML tags. Each json object should contain a function name and arguments as follows:
+<tool_calls>[{"name": <function-name-1>, "arguments": <args-dict-1>}, {"name": <function-name-2>, "arguments": <args-dict-2>},...]</tool_calls>
+{%- endset -%}
+{%- elif tool_output_format == "yaml" -%}
+{%- set tool_output_instructions -%}
+Return all function calls as a list of yaml objects within <tool_call></tool_call> XML tags. Each yaml object should contain a function name and arguments as follows:
+<tool_calls>
+- name: <function-name-1>
+  arguments: <args-dict-1>
+- name: <function-name-2>
+  arguments: <args-dict-2>
+...
+</tool_calls>
+{%- endset -%}
+{%- endif -%}
+{#- add_thoughts is not assigned anywhere in this template; presumably it is supplied by the caller as a render variable. thought_instructions is set here but not referenced in the rest of the template. -#}
+{%- if add_thoughts -%}
+{%- set thought_instructions -%}
+Prior to generating the function calls, you should generate the reasoning for why you're calling the function. Please generate these reasoning thoughts between <thinking> and </thinking> XML tags.
+{%- endset -%}
+{%- endif -%}
+{{- bos_token -}}
+{%- set reasoning_prompt='You are a thoughtful and systematic AI assistant built by ServiceNow Language Models (SLAM) lab. Before providing an answer, analyze the problem carefully and present your reasoning step by step. After explaining your thought process, provide the final solution in the following format: [BEGIN FINAL RESPONSE] ... [END FINAL RESPONSE].' -%}
+{#- Default system block with tool instructions, emitted when the first message is not a system message and tools are provided. -#}
+{%- if messages[0]['role'] != 'system' and tools is not none and tools|length > 0 -%}
+    {{- '<|system|>\n' + reasoning_prompt + available_tools_string + "\n" + tool_output_instructions + '\n<|end|>\n' -}}
+{%- endif -%}
+{#- Default system block without tools, emitted when no message has the system role. -#}
+{#- NOTE(review): when tools are provided and there is no system message, both this condition and the previous one hold, so two system blocks are emitted; confirm this is intended. -#}
+{%- if messages|selectattr('role', 'equalto', 'system')|list|length == 0 -%}
+{{- '<|system|>\n' + reasoning_prompt + '\n<|end|>\n' -}}
+{%- endif -%}
+{#- Message loop: render each message as a role header, its content, and an end marker, dispatching on message['role']. -#}
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {#- User content is either a plain string or a list of chunks: text chunks are emitted verbatim, image and image_url chunks become the [IMG] placeholder, anything else raises. -#}
+        {{- '<|user|>\n' }}
+        {%- if message['content'] is not string %}
+            {%- for chunk in message['content'] %}
+                {%- if chunk['type'] == 'text' %}
+                    {{- chunk['text'] }}
+                {%- elif chunk['type'] == 'image' or chunk['type'] == 'image_url'%}
+                    {{- '[IMG]' }}
+                {%- else %}
+                    {{- raise_exception('Unrecognized content type!') }}
+                {%- endif %}
+            {%- endfor %}
+        {%- else %}
+            {{- message['content'] }}
+        {%- endif %}
+        {{- '\n<|end|>\n' }}
+    {%- elif message['role'] == 'content' -%}
+        {#- For list-valued content only the first chunk's text is used. -#}
+        {%- if message['content'] is not string %}
+            {{- '<|content|>\n' + message['content'][0]['text'] + '\n<|end|>\n' -}}
+        {%- else %}
+            {{- '<|content|>\n' + message['content'] + '\n<|end|>\n' -}}
+        {%- endif -%}
+    {%- elif message['role'] == 'system' -%}
+        {#- The caller's system text is appended directly after reasoning_prompt with no separator; the tool list is added when tools are provided. -#}
+        {%- if message['content'] is not none and message['content']|length > 0 %}
+            {%- if message['content'] is string %}
+                {%- set system_message = message['content'] %}
+            {%- else %}
+                {%- set system_message = message['content'][0]['text'] %}
+            {%- endif %}
+        {%- else %}
+            {%- set system_message = '' %}
+        {%- endif %}
+        {%- if tools is not none and tools|length > 0 -%}
+            {{- '<|system|>\n' + reasoning_prompt + system_message + '\n' + available_tools_string + '\n<|end|>\n' -}}
+        {%- else -%}
+            {{- '<|system|>\n' + reasoning_prompt + system_message + '\n<|end|>\n' -}}
+        {%- endif -%}
+    {%- elif message['role'] == 'assistant' -%}
+        {#- When the assistant message is the last one, add_tool_id is switched off so its tool calls are rendered without an id field. -#}
+        {%- if loop.last -%}
+            {%- set add_tool_id = false -%}
+        {%- endif -%}
+        {{- '<|assistant|>\n' -}}
+        {%- if message['content'] is not none and message['content']|length > 0 -%}
+            {%- if message['content'] is not string and message['content'][0]['text'] is not none %}
+                {{- message['content'][0]['text'] }}
+            {%- else %}
+                {{- message['content'] -}}
+            {%- endif -%}
+        {%- elif message['chosen'] is not none and message['chosen']|length > 0 -%}
+            {#- Fallback when content is empty: use the first entry of message['chosen']. -#}
+            {{- message['chosen'][0] -}}
+        {%- endif -%}
+        {%- if add_thoughts and 'thought' in message and message['thought'] is not none -%}
+            {{- '<thinking>' + message['thought'] + '</thinking>' -}}
+        {%- endif -%}
+        {%- if message['tool_calls'] is not none and message['tool_calls']|length > 0 -%}
+            {#- Tool calls are serialized by string concatenation into a JSON-like list; arguments are passed through the string filter as-is. -#}
+            {{- '\n<tool_calls>[' -}}
+            {%- for tool_call in message["tool_calls"] -%}
+                {{- '{"name": "' + tool_call['function']['name'] + '", "arguments": ' + tool_call['function']['arguments']|string -}}
+                {%- if add_tool_id == true -%}
+                    {{- ', "id": "' + tool_call['id'] + '"' -}}
+                {%- endif -%}
+                {{- '}' -}}
+                {%- if not loop.last -%}{{- ', ' -}}{%- endif -%}
+            {%- endfor -%}
+            {{- ']</tool_calls>' -}}
+        {%- endif -%}
+        {{- '\n<|end|>\n' + eos_token -}}
+    {%- elif message['role'] == 'tool' -%}
+        {#- Tool results use the string content, or the first chunk's text for list-valued content. -#}
+        {%- if message['content'] is string %}
+            {%- set tool_message = message['content'] %}
+        {%- else %}
+            {%- set tool_message = message['content'][0]['text'] %}
+        {%- endif -%}
+        {{- '<|tool_result|>\n' + tool_message|string + '\n<|end|>\n' -}}
+    {%- endif -%}
+    {#- Generation prompt: open an assistant turn after the final message, unless that message is already from the assistant. -#}
+    {%- if loop.last and add_generation_prompt and message['role'] != 'assistant' -%}
+        {{- '<|assistant|>\n' -}}
+    {%- endif -%}
+{%- endfor -%}

config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "architectures": [
+    "LlavaForConditionalGeneration"
+  ],
+  "dtype": "bfloat16",
+  "ignore_index": -100,
+  "image_seq_length": 1,
+  "image_token_index": 10,
+  "model_type": "llava",
+  "multimodal_projector_bias": true,
+  "projector_hidden_act": "gelu",
+  "text_config": {
+    "_attn_implementation_autoset": false,
+    "attention_dropout": 0.0,
+    "head_dim": 32,
+    "hidden_act": "silu",
+    "hidden_size": 8,
+    "initializer_range": 0.02,
+    "intermediate_size": 64,
+    "max_position_embeddings": 262400,
+    "model_type": "mistral",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 2,
+    "num_key_value_heads": 4,
+    "rms_norm_eps": 1e-05,
+    "rope_theta": 1000000000.0,
+    "sliding_window": null,
+    "tf_legacy_loss": false,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 131072
+  },
+  "transformers_version": "4.57.0.dev0",
+  "vision_config": {
+    "_attn_implementation_autoset": false,
+    "attention_dropout": 0.0,
+    "head_dim": 32,
+    "hidden_act": "silu",
+    "hidden_size": 128,
+    "image_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 256,
+    "model_type": "pixtral",
+    "num_attention_heads": 4,
+    "num_channels": 3,
+    "num_hidden_layers": 2,
+    "patch_size": 16,
+    "rope_theta": 10000.0,
+    "tf_legacy_loss": false,
+    "use_bfloat16": false
+  },
+  "vision_feature_layer": -1,
+  "vision_feature_select_strategy": "full"
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "transformers_version": "4.57.0.dev0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88b138e76f9bb1b8ebd1ac83c81bdebafb60efb3871e8a32ea4309d6da1bd1a1
+size 5085984

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "PixtralImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "patch_size": {
+    "height": 16,
+    "width": 16
+  },
+  "processor_class": "PixtralProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 1024
+  }
+}

processor_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "image_break_token": "[IMG_BREAK]",
+  "image_end_token": "[IMG_END]",
+  "image_token": "[IMG]",
+  "patch_size": 16,
+  "processor_class": "PixtralProcessor",
+  "spatial_merge_size": 1
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84f33e6f52b2833e8cc17229af8eea363f640a898f19a48184a2c7f6f5a88337
+size 17077329

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff