Firworks committed on
Commit 2d48701 · verified · 1 Parent(s): fd0a929

Add NVFP4 quantized checkpoint

README.md ADDED
@@ -0,0 +1,27 @@
+ ---
+ datasets:
+ - Rombo-Org/Optimized_Reasoning
+ base_model:
+ - stepfun-ai/Step-3.5-Flash
+ tags:
+ - nvfp4
+ - fp4
+ - quantized
+ ---
+ # Step-3.5-Flash-nvfp4
+
+ **Format:** NVFP4. Weights and activations are quantized to FP4 with dual scaling (per-16-element FP8 block scales plus a global per-tensor scale).
+ **Base model:** `stepfun-ai/Step-3.5-Flash`
+ **How it was made:** One-shot calibration with LLM Compressor (NVFP4 recipe), using a long-sequence calibration set (1 sample of length 512) from Rombo-Org/Optimized_Reasoning.
+
+ > Notes: Keep `lm_head` in high precision; calibrate on long, domain-relevant sequences.
+
+ Check the original model card for information about this model.
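+
+ The quantization flow was roughly the following. This is a minimal sketch, not the exact script that was run: the preprocessing of the calibration sample is omitted, the output directory name is illustrative, and the argument names follow the `llmcompressor` `oneshot`/`QuantizationModifier` API; the `targets` and `ignore` lists mirror `recipe.yaml` in this repo.
+ ```python
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from llmcompressor import oneshot
+ from llmcompressor.modifiers.quantization import QuantizationModifier
+
+ BASE = "stepfun-ai/Step-3.5-Flash"
+ model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype="auto", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
+
+ # Calibration data: in practice the sample is chat-formatted and tokenized first,
+ # following the llm-compressor calibration examples; only the count/length matter here.
+ ds = load_dataset("Rombo-Org/Optimized_Reasoning", split="train").select(range(1))
+
+ # NVFP4 recipe matching recipe.yaml: quantize Linear/MoELinear, keep lm_head in high precision.
+ recipe = QuantizationModifier(targets=["Linear", "MoELinear"], scheme="NVFP4", ignore=["lm_head"])
+
+ # One-shot calibration (no training), then save the compressed checkpoint.
+ oneshot(
+     model=model,
+     dataset=ds,
+     recipe=recipe,
+     max_seq_length=512,
+     num_calibration_samples=1,
+ )
+
+ model.save_pretrained("Step-3.5-Flash-nvfp4", save_compressed=True)
+ tokenizer.save_pretrained("Step-3.5-Flash-nvfp4")
+ ```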
+
+ # Running the model with vLLM in Docker
+ ```sh
+ sudo docker run --runtime nvidia --gpus all -p 8000:8000 --ipc=host vllm/vllm-openai:nightly --model Firworks/Step-3.5-Flash-nvfp4 --dtype auto --max-model-len 32768
+ ```
+ This was tested on an RTX Pro 6000 Blackwell cloud instance.
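+
+ Once the container is up, it exposes the standard OpenAI-compatible API on port 8000. A quick sanity check (the prompt is only an example):
+ ```python
+ from openai import OpenAI
+
+ # vLLM's OpenAI-compatible server does not require a real API key by default.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+ resp = client.chat.completions.create(
+     model="Firworks/Step-3.5-Flash-nvfp4",
+     messages=[{"role": "user", "content": "Summarize what NVFP4 quantization changes about a model."}],
+     max_tokens=256,
+ )
+ print(resp.choices[0].message.content)
+ ```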
+
+ If there are other models you're interested in seeing quantized to NVFP4 for use on the DGX Spark or other modern Blackwell (or newer) cards, let me know. I'm trying to make more NVFP4 models available to allow more people to try them out.
chat_template.jinja ADDED
@@ -0,0 +1,80 @@
1
+ {% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %}{% endmacro %}
2
+ {{bos_token}}{%- if tools %}
3
+ {{- '<|im_start|>system\n' }}
4
+ {%- if messages[0].role == 'system' %}
5
+ {{- render_content(messages[0].content) + '\n\n' }}
6
+ {%- endif %}
7
+ {{- "# Tools\n\nYou have access to the following functions in JSONSchema format:\n\n<tools>" }}
8
+ {%- for tool in tools %}
9
+ {{- "\n" }}
10
+ {{- tool | tojson(ensure_ascii=False) }}
11
+ {%- endfor %}
12
+ {{- "\n</tools>\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...>\n...\n</function> block must be nested within <tool_call>\n...\n</tool_call> XML tags\n- Required parameters MUST be specified\n</IMPORTANT><|im_end|>\n" }}
13
+ {%- else %}
14
+ {%- if messages[0].role == 'system' %}
15
+ {{- '<|im_start|>system\n' + render_content(messages[0].content) + '<|im_end|>\n' }}
16
+ {%- endif %}
17
+ {%- endif %}
18
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
19
+ {%- for message in messages[::-1] %}
20
+ {%- set index = (messages|length - 1) - loop.index0 %}
21
+ {%- if ns.multi_step_tool and message.role == "user" and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) %}
22
+ {%- set ns.multi_step_tool = false %}
23
+ {%- set ns.last_query_index = index %}
24
+ {%- endif %}
25
+ {%- endfor %}
26
+ {%- for message in messages %}
27
+ {%- set content = render_content(message.content) %}
28
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
29
+ {%- set role_name = 'observation' if (message.role == "system" and not loop.first and message.name == 'observation') else message.role %}
30
+ {{- '<|im_start|>' + role_name + '\n' + content + '<|im_end|>' + '\n' }}
31
+ {%- elif message.role == "assistant" %}
32
+ {%- if message.reasoning_content is string %}
33
+ {%- set reasoning_content = render_content(message.reasoning_content) %}
34
+ {%- else %}
35
+ {%- if '</think>' in content %}
36
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
38
+ {%- else %}
39
+ {%- set reasoning_content = '' %}
40
+ {%- endif %}
41
+ {%- endif %}
42
+ {%- if loop.index0 > ns.last_query_index %}
43
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n' + content }}
44
+ {%- else %}
45
+ {{- '<|im_start|>' + message.role + '\n' + content }}
46
+ {%- endif %}
47
+ {%- if message.tool_calls %}
48
+ {%- for tool_call in message.tool_calls %}
49
+ {%- if tool_call.function is defined %}
50
+ {%- set tool_call = tool_call.function %}
51
+ {%- endif %}
52
+ {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
53
+ {%- if tool_call.arguments is defined %}
54
+ {%- set arguments = tool_call.arguments %}
55
+ {%- for args_name, args_value in arguments|items %}
56
+ {{- '<parameter=' + args_name + '>\n' }}
57
+ {%- set args_value = args_value | tojson(ensure_ascii=False) | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
58
+ {{- args_value }}
59
+ {{- '\n</parameter>\n' }}
60
+ {%- endfor %}
61
+ {%- endif %}
62
+ {{- '</function>\n</tool_call>' }}
63
+ {%- endfor %}
64
+ {%- endif %}
65
+ {{- '<|im_end|>\n' }}
66
+ {%- elif message.role == "tool" %}
67
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
68
+ {{- '<|im_start|>tool_response\n' }}
69
+ {%- endif %}
70
+ {{- '<tool_response>' }}
71
+ {{- content }}
72
+ {{- '</tool_response>' }}
73
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
74
+ {{- '<|im_end|>\n' }}
75
+ {%- endif %}
76
+ {%- endif %}
77
+ {%- endfor %}
78
+ {%- if add_generation_prompt %}
79
+ {{- '<|im_start|>assistant\n<think>\n' }}
80
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,376 @@
1
+ {
2
+ "architectures": [
3
+ "Step3p5ForCausalLM"
4
+ ],
5
+ "att_impl_type": "GQA",
6
+ "attention_other_setting": {
7
+ "attention_type": "sliding_attention",
8
+ "head_dim": 128,
9
+ "num_attention_groups": 8,
10
+ "num_attention_heads": 96,
11
+ "true_head_dim": 128
12
+ },
13
+ "auto_map": {
14
+ "AutoConfig": "configuration_step3p5.Step3p5Config",
15
+ "AutoModelForCausalLM": "modeling_step3p5.Step3p5ForCausalLM"
16
+ },
17
+ "bos_token_id": 0,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": [
20
+ 1,
21
+ 2,
22
+ 128007
23
+ ],
24
+ "head_dim": 128,
25
+ "hidden_size": 4096,
26
+ "intermediate_size": 11264,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "full_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "full_attention",
65
+ "sliding_attention",
66
+ "sliding_attention",
67
+ "sliding_attention",
68
+ "full_attention",
69
+ "sliding_attention",
70
+ "sliding_attention",
71
+ "sliding_attention",
72
+ "full_attention",
73
+ "sliding_attention",
74
+ "sliding_attention",
75
+ "sliding_attention"
76
+ ],
77
+ "max_position_embeddings": 262144,
78
+ "max_seq_len": 262144,
79
+ "model_type": "step3p5",
80
+ "moe_every_n_layer": 1,
81
+ "moe_intermediate_size": 1280,
82
+ "moe_layer_offset": 0,
83
+ "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
84
+ "moe_num_experts": 288,
85
+ "moe_router_activation": "sigmoid",
86
+ "moe_router_scaling_factor": 3.0,
87
+ "moe_top_k": 8,
88
+ "need_fp32_gate": true,
89
+ "norm_expert_weight": true,
90
+ "num_attention_groups": 8,
91
+ "num_attention_heads": 64,
92
+ "num_hidden_layers": 45,
93
+ "num_nextn_predict_layers": 3,
94
+ "partial_rotary_factor": 0.5,
95
+ "partial_rotary_factors": [
96
+ 0.5,
97
+ 1.0,
98
+ 1.0,
99
+ 1.0,
100
+ 0.5,
101
+ 1.0,
102
+ 1.0,
103
+ 1.0,
104
+ 0.5,
105
+ 1.0,
106
+ 1.0,
107
+ 1.0,
108
+ 0.5,
109
+ 1.0,
110
+ 1.0,
111
+ 1.0,
112
+ 0.5,
113
+ 1.0,
114
+ 1.0,
115
+ 1.0,
116
+ 0.5,
117
+ 1.0,
118
+ 1.0,
119
+ 1.0,
120
+ 0.5,
121
+ 1.0,
122
+ 1.0,
123
+ 1.0,
124
+ 0.5,
125
+ 1.0,
126
+ 1.0,
127
+ 1.0,
128
+ 0.5,
129
+ 1.0,
130
+ 1.0,
131
+ 1.0,
132
+ 0.5,
133
+ 1.0,
134
+ 1.0,
135
+ 1.0,
136
+ 0.5,
137
+ 1.0,
138
+ 1.0,
139
+ 1.0,
140
+ 0.5,
141
+ 1.0,
142
+ 1.0,
143
+ 1.0
144
+ ],
145
+ "quantization_config": {
146
+ "config_groups": {
147
+ "group_0": {
148
+ "format": "nvfp4-pack-quantized",
149
+ "input_activations": {
150
+ "actorder": null,
151
+ "block_structure": null,
152
+ "dynamic": "local",
153
+ "group_size": 16,
154
+ "num_bits": 4,
155
+ "observer": "static_minmax",
156
+ "observer_kwargs": {},
157
+ "scale_dtype": "torch.float8_e4m3fn",
158
+ "strategy": "tensor_group",
159
+ "symmetric": true,
160
+ "type": "float",
161
+ "zp_dtype": null
162
+ },
163
+ "output_activations": null,
164
+ "targets": [
165
+ "Linear",
166
+ "MoELinear"
167
+ ],
168
+ "weights": {
169
+ "actorder": null,
170
+ "block_structure": null,
171
+ "dynamic": false,
172
+ "group_size": 16,
173
+ "num_bits": 4,
174
+ "observer": "static_minmax",
175
+ "observer_kwargs": {},
176
+ "scale_dtype": "torch.float8_e4m3fn",
177
+ "strategy": "tensor_group",
178
+ "symmetric": true,
179
+ "type": "float",
180
+ "zp_dtype": null
181
+ }
182
+ }
183
+ },
184
+ "format": "nvfp4-pack-quantized",
185
+ "global_compression_ratio": null,
186
+ "ignore": [
187
+ "lm_head"
188
+ ],
189
+ "kv_cache_scheme": null,
190
+ "quant_method": "compressed-tensors",
191
+ "quantization_status": "compressed",
192
+ "sparsity_config": {},
193
+ "transform_config": {},
194
+ "version": "0.13.0"
195
+ },
196
+ "rms_norm_eps": 1e-05,
197
+ "rope_parameters": {
198
+ "factor": 2.0,
199
+ "high_freq_factor": 32.0,
200
+ "low_freq_factor": 1.0,
201
+ "original_max_position_embeddings": 131072,
202
+ "rope_type": "llama3"
203
+ },
204
+ "rope_scaling": {
205
+ "factor": 2.0,
206
+ "high_freq_factor": 32.0,
207
+ "low_freq_factor": 1.0,
208
+ "original_max_position_embeddings": 131072,
209
+ "rope_type": "llama3"
210
+ },
211
+ "rope_theta": [
212
+ 5000000.0,
213
+ 10000.0,
214
+ 10000.0,
215
+ 10000.0,
216
+ 5000000.0,
217
+ 10000.0,
218
+ 10000.0,
219
+ 10000.0,
220
+ 5000000.0,
221
+ 10000.0,
222
+ 10000.0,
223
+ 10000.0,
224
+ 5000000.0,
225
+ 10000.0,
226
+ 10000.0,
227
+ 10000.0,
228
+ 5000000.0,
229
+ 10000.0,
230
+ 10000.0,
231
+ 10000.0,
232
+ 5000000.0,
233
+ 10000.0,
234
+ 10000.0,
235
+ 10000.0,
236
+ 5000000.0,
237
+ 10000.0,
238
+ 10000.0,
239
+ 10000.0,
240
+ 5000000.0,
241
+ 10000.0,
242
+ 10000.0,
243
+ 10000.0,
244
+ 5000000.0,
245
+ 10000.0,
246
+ 10000.0,
247
+ 10000.0,
248
+ 5000000.0,
249
+ 10000.0,
250
+ 10000.0,
251
+ 10000.0,
252
+ 5000000.0,
253
+ 10000.0,
254
+ 10000.0,
255
+ 10000.0,
256
+ 5000000.0,
257
+ 10000.0,
258
+ 10000.0,
259
+ 10000.0
260
+ ],
261
+ "share_expert_dim": 1280,
262
+ "sink": false,
263
+ "sliding_window": 512,
264
+ "swiglu_limits": [
265
+ 0.0,
266
+ 0.0,
267
+ 0.0,
268
+ 0.0,
269
+ 0.0,
270
+ 0.0,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0,
275
+ 0.0,
276
+ 0.0,
277
+ 0.0,
278
+ 0.0,
279
+ 0.0,
280
+ 0.0,
281
+ 0.0,
282
+ 0.0,
283
+ 0.0,
284
+ 0.0,
285
+ 0.0,
286
+ 0.0,
287
+ 0.0,
288
+ 0.0,
289
+ 0.0,
290
+ 0.0,
291
+ 0.0,
292
+ 0.0,
293
+ 0.0,
294
+ 0.0,
295
+ 0.0,
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.0,
302
+ 0.0,
303
+ 0.0,
304
+ 0.0,
305
+ 0.0,
306
+ 0.0,
307
+ 0.0,
308
+ 7,
309
+ 7,
310
+ 0.0,
311
+ 0.0,
312
+ 0.0
313
+ ],
314
+ "swiglu_limits_shared": [
315
+ 0.0,
316
+ 0.0,
317
+ 0.0,
318
+ 0.0,
319
+ 0.0,
320
+ 0.0,
321
+ 0.0,
322
+ 0.0,
323
+ 0.0,
324
+ 0.0,
325
+ 0.0,
326
+ 0.0,
327
+ 0.0,
328
+ 0.0,
329
+ 0.0,
330
+ 0.0,
331
+ 0.0,
332
+ 0.0,
333
+ 0.0,
334
+ 0.0,
335
+ 0.0,
336
+ 0.0,
337
+ 0.0,
338
+ 0.0,
339
+ 0.0,
340
+ 0.0,
341
+ 0.0,
342
+ 0.0,
343
+ 0.0,
344
+ 0.0,
345
+ 0.0,
346
+ 0.0,
347
+ 0.0,
348
+ 0.0,
349
+ 0.0,
350
+ 0.0,
351
+ 0.0,
352
+ 0.0,
353
+ 0.0,
354
+ 0.0,
355
+ 0.0,
356
+ 0.0,
357
+ 0.0,
358
+ 0.0,
359
+ 16,
360
+ 0.0,
361
+ 0.0,
362
+ 0.0
363
+ ],
364
+ "transformers_version": "4.57.3",
365
+ "use_cache": false,
366
+ "use_head_wise_attn_gate": true,
367
+ "use_moe": true,
368
+ "use_moe_router_bias": true,
369
+ "use_qk_norm": true,
370
+ "use_rope_layers": [],
371
+ "vocab_size": 128896,
372
+ "yarn_only_types": [
373
+ "full_attention"
374
+ ],
375
+ "zero_centered": true
376
+ }
configuration_step3p5.py ADDED
@@ -0,0 +1,59 @@
+ from typing import Any, Optional, Union
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class Step3p5Config(PretrainedConfig):
+     model_type = "step3p5"
+     architectures = ["Step3p5ForCausalLM"]
+
+     def __init__(
+         self,
+         hidden_size: int = 4096,
+         intermediate_size: int = 11264,
+         num_attention_heads: int = 64,
+         num_attention_groups: int = 8,
+         num_hidden_layers: int = 45,
+         max_seq_len: int = 128000,
+         vocab_size: int = 128815,
+         rms_norm_eps: float = 1e-5,
+         moe_intermediate_size: int = 1280,
+         moe_num_experts: int = 288,
+         moe_top_k: int = 8,
+         rope_theta: float = 10000,
+         rope_scaling: Optional[dict[str, Any]] = None,
+         max_position_embeddings: int = 128000,
+         share_expert_dims: int = 1280,
+         head_dim: int = 128,
+         norm_expert_weight: bool = True,
+         layer_types: list[str] = None,
+         sliding_window: Optional[int] = None,
+         moe_layers_enum: tuple[int] = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+                                        15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+                                        25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+                                        35, 36, 37, 38, 39, 40, 41, 42, 43, 44),
+         **kwargs,
+     ) -> None:
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_attention_heads = num_attention_heads
+         self.num_attention_groups = num_attention_groups
+         self.num_hidden_layers = num_hidden_layers
+         self.max_seq_len = max_seq_len
+         self.vocab_size = vocab_size
+         self.rms_norm_eps = rms_norm_eps
+         self.moe_intermediate_size = moe_intermediate_size
+         self.moe_num_experts = moe_num_experts
+         self.moe_top_k = moe_top_k
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.max_position_embeddings = max_position_embeddings
+         self.share_expert_dim = share_expert_dims
+         self.head_dim = head_dim
+         self.norm_expert_weight = norm_expert_weight
+         self.moe_layers_enum = moe_layers_enum
+         self.layer_types = layer_types
+         self.sliding_window = sliding_window
+         super().__init__(**kwargs)
+
generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "do_sample": true,
+   "eos_token_id": [
+     1,
+     2,
+     128007
+   ],
+   "transformers_version": "4.57.3"
+ }
model-00001-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8a4116f97b83f5272cbcf4a14d879bd4cf7cced6cfb1f94c9959e22deee32af
3
+ size 4967057968
model-00002-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d3c857834bae585f823926408406efe936f96bc96a187bd166f2ea9e984852
3
+ size 4388928280
model-00003-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72845c5cd1603b4642f0ffc8e0f0202e3ffd0365f7dade7dff1c9b4953d43e95
3
+ size 4317831672
model-00004-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f137d8752377ca68fa1c7982528bef8caeeb0141767c40477a7d0f8a005978b
3
+ size 4369980176
model-00005-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:589470dfae75ba75232ad47f0f43399ac7a376c60e30c23fd667107793a57c8b
3
+ size 4388928400
model-00006-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78f23487e3bab1f0048b335a3138e15533cac08fc1539247bcae8cbef3d059e6
3
+ size 4298883640
model-00007-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f97bd8e16073c02428151b0171fbacb6579f3d340022b958f17f81c877b2c50
3
+ size 4388928376
model-00008-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:402fa96eea50c25016c1751d0893718fcf7097072e60f7b4bac39ce13086a8d6
3
+ size 4369980288
model-00009-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d179a8923dab1deb79a13de991191313ef261794fcb96fd0371fd24dcd99c44a
3
+ size 4317831736
model-00010-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92eb378130807584c9f8f2a04b29115205f36c21c49344616a3272d81a3adaaf
3
+ size 4388928376
model-00011-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:138c4db286faca86fb22d534d68c90922eb6d9a9f3f6544cbd2cba8756b0e419
3
+ size 4369980288
model-00012-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d7460bbc4db7812de1559686a83d22502c7bcff90fb383429bd2ec82a3fd9b5
3
+ size 4317831736
model-00013-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cee0b32bd83e9bae3830457deee6ec5f0b78f11bf5bf70f4d9726711cca97ad0
3
+ size 4369980280
model-00014-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:471cab510b6c6bc4c5ca8473542548e0f33556e42dc93fc36148c4da04e97d44
3
+ size 4388928384
model-00015-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c058151516351df8dd559daa6322842dcfa3358aa8d009dcd7eb5c84acad571e
3
+ size 4317831736
model-00016-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0bb9ecac2ae3f3c1458c15b3ea09bd6c77102e44666f579d4d5f57de3f48bb0
3
+ size 4369980280
model-00017-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c66c30f7e9fb6af842634957b92791caacff7a191ead59a9b768da0fc9eb72df
3
+ size 4388928384
model-00018-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7902621aa51b9eaa855da55e89f17a265e3b0351691d58a712eb16fea66e2b88
3
+ size 4298883640
model-00019-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ae180ac81f7e402b7ddee3f6f51399e9dbe7f684e5399f53dd395a74811e16
3
+ size 4388928376
model-00020-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a5b1b25ee6d4f5a932a50cd17868b81b157de890d8ab47da33af620e45cb491
3
+ size 4369980288
model-00021-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2cbee091b139efe89c3f4aaade8b0f1840640a792a47c484a3a7612b27f814f
3
+ size 4317831736
model-00022-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1489fde66e452704854197f8dc833ce9e5f51275addec07d6293728e3100f8c8
3
+ size 4388928376
model-00023-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:285f76218c43fce239d54372e2e57040c0ccf7be7992abe6c6709ef656feb3bc
3
+ size 4369980288
model-00024-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f9c6fd63a52aab3560d9c0e26dc76457527c049b2e488119a1c406983c40b3
3
+ size 4317831736
model-00025-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e037a341d982cc982d4e21c8f3c2c9a1ec48712c6403f080e44b22875b22ad1
3
+ size 4369980280
model-00026-of-00026.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b2542bce70caefe00ef78727dd0adf1c4def65fc3000b342e0401c0abe8c576
3
+ size 2763483928
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_step3p5.py ADDED
@@ -0,0 +1,900 @@
1
+ # Copyright 2025 The LLAMA4 and HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from dataclasses import dataclass
16
+ from typing import Callable, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from transformers.activations import ACT2FN
22
+ from transformers.cache_utils import Cache, DynamicCache
23
+ from transformers.generation import GenerationMixin
24
+ from transformers.masking_utils import (create_causal_mask,
25
+ create_sliding_window_causal_mask)
26
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
27
+ from transformers.modeling_layers import GradientCheckpointingLayer
28
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
29
+ from transformers.modeling_rope_utils import (ROPE_INIT_FUNCTIONS,
30
+ dynamic_rope_update)
31
+ from transformers.modeling_utils import (ALL_ATTENTION_FUNCTIONS,
32
+ PreTrainedModel)
33
+ from transformers.processing_utils import Unpack
34
+ from transformers.utils import TransformersKwargs, can_return_tuple, logging
35
+
36
+ from .configuration_step3p5 import Step3p5Config
37
+
38
+ logger = logging.get_logger(__name__)
39
+
40
+ __all__ = ["Step3p5Model", "Step3p5ForCausalLM"]
41
+
42
+ class Step3p5RotaryEmbedding(nn.Module):
43
+
44
+ def __init__(self, config: Step3p5Config, device=None, layer_idx=None):
45
+ super().__init__()
46
+ # BC: "rope_type" was originally "type"
47
+ self.layer_idx = layer_idx
48
+ if config.rope_parameters is not None:
49
+ self.rope_type = config.rope_parameters.get(
50
+ "rope_type", config.rope_parameters.get("type"))
51
+ else:
52
+ self.rope_type = "default"
53
+ self.max_seq_len_cached = config.max_position_embeddings
54
+ self.original_max_seq_len = config.max_position_embeddings
55
+
56
+ partial_rotary_factors = getattr(config, "partial_rotary_factors",
57
+ None)
58
+ if partial_rotary_factors is not None:
59
+ config.partial_rotary_factor = partial_rotary_factors[
60
+ self.layer_idx]
61
+ else:
62
+ config.partial_rotary_factor = 1.0
63
+
64
+ self.rope_theta = config.rope_theta
65
+ if isinstance(config.rope_theta, list):
66
+ self.rope_theta = config.rope_theta.copy()
67
+ config.rope_theta = self.rope_theta[self.layer_idx]
68
+
69
+ self.config = config
70
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
71
+ inv_freq, self.attention_scaling = self.rope_init_fn(
72
+ self.config, device)
73
+
74
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
75
+ self.original_inv_freq = self.inv_freq
76
+ config.rope_theta = self.rope_theta
77
+
78
+ @torch.no_grad()
79
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
80
+ def forward(self, x, position_ids):
81
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(
82
+ position_ids.shape[0], -1, 1).to(x.device)
83
+ position_ids_expanded = position_ids[:, None, :].float().to(x.device)
84
+
85
+ device_type = x.device.type if isinstance(
86
+ x.device.type, str) and x.device.type != "mps" else "cpu"
87
+ with torch.autocast(device_type=device_type,
88
+ enabled=False): # Force float32
89
+ freqs = (inv_freq_expanded.float()
90
+ @ position_ids_expanded.float()).transpose(1, 2)
91
+ emb = torch.cat((freqs, freqs), dim=-1)
92
+ cos = emb.cos() * self.attention_scaling
93
+ sin = emb.sin() * self.attention_scaling
94
+
95
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
96
+
97
+
98
+ def rotate_half(x):
99
+ """Rotates half the hidden dims of the input."""
100
+ x1 = x[..., :x.shape[-1] // 2]
101
+ x2 = x[..., x.shape[-1] // 2:]
102
+ return torch.cat((-x2, x1), dim=-1)
103
+
104
+
105
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
106
+ """Applies Rotary Position Embedding to the query and key tensors.
107
+
108
+ Args:
109
+ q (`torch.Tensor`): The query tensor.
110
+ k (`torch.Tensor`): The key tensor.
111
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
112
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
113
+ position_ids (`torch.Tensor`, *optional*):
114
+ Deprecated and unused.
115
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
116
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
117
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
118
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
119
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
120
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
121
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
122
+ Returns:
123
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
124
+ """
125
+ rotary_dim = cos.shape[-1]
126
+ q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
127
+ k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
128
+
129
+ # Apply rotary embeddings on the first half or full tensor
130
+ q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
131
+ k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
132
+
133
+ # Concatenate back to full shape
134
+ q_embed = torch.cat([q_embed, q_pass], dim=-1)
135
+ k_embed = torch.cat([k_embed, k_pass], dim=-1)
136
+ return q_embed, k_embed
137
+
138
+
139
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
140
+ """
141
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
142
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
143
+ """
144
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
145
+ if n_rep == 1:
146
+ return hidden_states
147
+ hidden_states = hidden_states[:, :,
148
+ None, :, :].expand(batch,
149
+ num_key_value_heads,
150
+ n_rep, slen, head_dim)
151
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen,
152
+ head_dim)
153
+
154
+
155
+ # Adapted from transformers.models.llama.modeling_llama.eager_attention_forward -> llama4 doesn't cast attn weights to fp32
156
+ def eager_attention_forward(
157
+ module: nn.Module,
158
+ query: torch.Tensor,
159
+ key: torch.Tensor,
160
+ value: torch.Tensor,
161
+ attention_mask: Optional[torch.Tensor],
162
+ scaling: float,
163
+ dropout: float = 0.0,
164
+ **kwargs,
165
+ ):
166
+ key_states = repeat_kv(key, module.num_key_value_groups)
167
+ value_states = repeat_kv(value, module.num_key_value_groups)
168
+ # breakpoint()
169
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
170
+ if attention_mask is not None:
171
+ causal_mask = attention_mask[:, :, :, :key_states.shape[-2]]
172
+ attn_weights = attn_weights + causal_mask
173
+
174
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
175
+ attn_weights = nn.functional.dropout(attn_weights,
176
+ p=dropout,
177
+ training=module.training)
178
+ attn_output = torch.matmul(attn_weights, value_states)
179
+ attn_output = attn_output.transpose(1, 2).contiguous()
180
+
181
+ return attn_output, attn_weights
182
+
183
+ @dataclass
184
+ class Step3p5CausalLMOutputWithPast(ModelOutput):
185
+ r"""
186
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
187
+ Language modeling loss (for next-token prediction).
188
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
189
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
190
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
191
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
192
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
193
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
194
+ `past_key_values` input) to speed up sequential decoding.
195
+ """
196
+
197
+ loss: Optional[torch.FloatTensor] = None
198
+ last_hidden_state: Optional[torch.FloatTensor] = None
199
+ logits: torch.FloatTensor = None
200
+ past_key_values: Optional[list[torch.FloatTensor]] = None
201
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
202
+ attentions: Optional[tuple[torch.FloatTensor]] = None
203
+
204
+
205
+ class Step3p5MLP(nn.Module):
206
+
207
+ def __init__(self, config, intermediate_size=None, swiglu_limit=None):
208
+ super().__init__()
209
+ self.config = config
210
+ self.hidden_size = config.hidden_size
211
+ self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
212
+ self.gate_proj = nn.Linear(self.hidden_size,
213
+ self.intermediate_size,
214
+ bias=False)
215
+ self.up_proj = nn.Linear(self.hidden_size,
216
+ self.intermediate_size,
217
+ bias=False)
218
+ self.down_proj = nn.Linear(self.intermediate_size,
219
+ self.hidden_size,
220
+ bias=False)
221
+ self.act_fn = ACT2FN["silu"]
222
+ self.limit = swiglu_limit
223
+
224
+ def forward(self, x):
225
+ up = self.up_proj(x)
226
+ gate = self.act_fn(self.gate_proj(x))
227
+ if self.limit is not None:
228
+ gate = gate.clamp(min=None, max=self.limit)
229
+ up = up.clamp(min=-self.limit, max=self.limit)
230
+
231
+ return self.down_proj(gate * up)
232
+
233
+
234
+ def sigmoid_routing_function(gating_output: torch.Tensor, topk: int,
235
+ renormalize: bool):
236
+ gating_output = gating_output.float()
237
+ gate_prob = torch.sigmoid(gating_output)
238
+ gate_prob = gate_prob / gate_prob.sum(dim=-1, keepdim=True)
239
+ topk_prob, indices = torch.topk(gate_prob, k=topk, dim=1)
240
+ expert_topk_weight = topk_prob
241
+ if renormalize:
242
+ expert_topk_weight = expert_topk_weight / torch.sum(
243
+ expert_topk_weight, dim=-1, keepdim=True)
244
+ return expert_topk_weight, indices
245
+
246
+
247
+ def softmax_routing_function(gating_output: torch.Tensor, top_k: int,
248
+ renormalize: bool):
249
+ gating_output = gating_output.float()
250
+ gate_prob = torch.softmax(gating_output, dim=-1)
251
+ gate_prob = gate_prob / gate_prob.sum(dim=-1, keepdim=True)
252
+ topk_prob, indices = torch.topk(gate_prob, k=top_k, dim=1)
253
+ expert_topk_weight = topk_prob
254
+ if renormalize:
255
+ expert_topk_weight = expert_topk_weight / torch.sum(
256
+ expert_topk_weight, dim=-1, keepdim=True)
257
+ return expert_topk_weight, indices.to(torch.int32)
258
+
259
+
260
+ class MoELinear(nn.Module):
261
+
262
+ def __init__(self, num_experts, in_features, out_features):
263
+ super().__init__()
264
+ self.num_experts = num_experts
265
+ self.in_features = in_features
266
+ self.out_features = out_features
267
+ self.weight = nn.Parameter(
268
+ torch.empty(num_experts, out_features, in_features))
269
+
270
+ def forward(self, x, expert_id):
271
+ x = F.linear(x.float(), self.weight[expert_id].float())
272
+ return x
273
+
274
+
275
+ class Step3p5MoEMLP(nn.Module):
276
+
277
+ def __init__(self, config, swiglu_limit=None):
278
+ super().__init__()
279
+ self.num_experts = config.moe_num_experts
280
+ self.top_k = config.moe_top_k
281
+ self.hidden_size = config.hidden_size
282
+ self.moe_intermediate_size = config.moe_intermediate_size
283
+
284
+ self.use_moe_router_bias = config.use_moe_router_bias
285
+ if self.use_moe_router_bias:
286
+ self.router_bias = nn.Parameter(torch.zeros(config.moe_num_experts,
287
+ dtype=torch.float32),
288
+ requires_grad=False)
289
+ self.custom_routing_function = self.router_bias_func
290
+ elif config.moe_router_activation == "sigmoid":
291
+ self.custom_routing_function = sigmoid_routing_function
292
+ else:
293
+ self.custom_routing_function = None
294
+ self.need_fp32_gate = config.need_fp32_gate
295
+ self.routed_scaling_factor = getattr(config,
296
+ "moe_router_scaling_factor", 1.0)
297
+
298
+ # gating
299
+ self.gate = nn.Linear(self.hidden_size, self.num_experts, bias=False)
300
+
301
+ self.act_fn = ACT2FN["silu"]
302
+ self.limit = swiglu_limit
303
+
304
+ self.up_proj = MoELinear(self.num_experts, self.hidden_size,
305
+ self.moe_intermediate_size)
306
+ self.gate_proj = MoELinear(self.num_experts, self.hidden_size,
307
+ self.moe_intermediate_size)
308
+ self.down_proj = MoELinear(self.num_experts,
309
+ self.moe_intermediate_size,
310
+ self.hidden_size)
311
+
312
+ def router_bias_func(self, gating_output: torch.Tensor, topk: int,
313
+ renormalize: bool):
314
+ gate_prob = torch.sigmoid(gating_output.float())
315
+ gate_prob_with_bias = gate_prob + self.router_bias.unsqueeze(0)
316
+ _, indices = torch.topk(gate_prob_with_bias, k=topk, dim=1)
317
+ topk_prob = torch.gather(gate_prob, 1, indices)
318
+ expert_topk_weight = topk_prob
319
+ if renormalize:
320
+ expert_topk_weight = expert_topk_weight / (
321
+ torch.sum(expert_topk_weight, dim=-1, keepdim=True) + 1e-20)
322
+ return expert_topk_weight, indices
323
+
324
+ def get_expert_output(self, inputs: torch.Tensor, expert_id):
325
+ #if self.limit is None:
326
+ up = self.up_proj(inputs, expert_id)
327
+ gate = self.act_fn(self.gate_proj(inputs, expert_id))
328
+ if self.limit is not None:
329
+ gate = gate.clamp(min=None, max=self.limit)
330
+ up = up.clamp(min=-self.limit, max=self.limit)
331
+
332
+ return self.down_proj(gate * up, expert_id)
333
+
334
+ def forward(self, hidden_states):
335
+ """ """
336
+ batch_size, sequence_length, hidden_dim = hidden_states.shape
337
+ hidden_states = hidden_states.view(-1, hidden_dim)
338
+ if self.need_fp32_gate:
339
+ router_logits = torch.matmul(hidden_states.to(torch.float32), self.gate.weight.t().to(torch.float32))
340
+ else:
341
+ # router_logits: (batch * sequence_length, n_experts)
342
+ router_logits = self.gate(hidden_states)
343
+
344
+ if self.custom_routing_function:
345
+ routing_weights, selected_experts = self.custom_routing_function(
346
+ router_logits, self.top_k, renormalize=True)
347
+ else:
348
+ routing_weights = F.softmax(router_logits,
349
+ dim=1,
350
+ dtype=torch.float)
351
+ routing_weights, selected_experts = torch.topk(routing_weights,
352
+ self.top_k,
353
+ dim=-1)
354
+
355
+ routing_weights = routing_weights * self.routed_scaling_factor
356
+
357
+ final_hidden_states = torch.zeros(
358
+ (batch_size * sequence_length, hidden_dim),
359
+ dtype=hidden_states.dtype,
360
+ device=hidden_states.device)
361
+
362
+ # One hot encode the selected experts to create an expert mask
363
+ # this will be used to easily index which expert is going to be sollicitated
364
+ expert_mask = torch.nn.functional.one_hot(
365
+ selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
366
+
367
+ # Loop over all available experts in the model and perform the computation on each expert
368
+ for expert_idx in range(self.num_experts):
369
+ idx, top_x = torch.where(expert_mask[expert_idx])
370
+
371
+ # Index the correct hidden states and compute the expert hidden state for
372
+ # the current expert. We need to make sure to multiply the output hidden
373
+ # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
374
+ current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
375
+ current_hidden_states = (
376
+ self.get_expert_output(current_state, expert_idx) *
377
+ routing_weights[top_x, idx, None])
378
+
379
+ # However `index_add_` only support torch tensors for indexing so we'll use
380
+ # the `top_x` tensor here.
381
+ final_hidden_states.index_add_(
382
+ 0, top_x, current_hidden_states.to(hidden_states.dtype))
383
+ final_hidden_states = final_hidden_states.reshape(
384
+ batch_size, sequence_length, hidden_dim)
385
+ return final_hidden_states
386
+
387
+
388
+ class Step3p5RMSNorm(nn.Module):
389
+
390
+ def __init__(
391
+ self,
392
+ hidden_size: int,
393
+ eps: float = 1e-5,
394
+ ) -> None:
395
+ super().__init__()
396
+ self.weight = nn.Parameter(torch.ones(hidden_size))
397
+ self.variance_epsilon = eps
398
+
399
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
400
+ dtype = x.dtype
401
+ x = x.float()
402
+ variance = x.pow(2).mean(dim=-1, keepdim=True)
403
+ normed = x * torch.rsqrt(variance + self.variance_epsilon)
404
+ normed = normed * (self.weight.float() + 1)
405
+ return normed.to(dtype)
406
+ class Step3p5Attention(nn.Module):
407
+
408
+ def __init__(self, config: Step3p5Config, layer_idx):
409
+ super().__init__()
410
+ self.config = config
411
+ self.layer_idx = layer_idx
412
+ self.num_attention_heads = config.num_attention_heads
413
+ self.num_key_value_heads = config.num_attention_groups
414
+
415
+ layer_types = getattr(config, "layer_types", [])
416
+ if layer_types:
417
+ enable_sliding_window = layer_types[
418
+ self.layer_idx] == "sliding_attention"
419
+ else:
420
+ enable_sliding_window = self.layer_idx % 2 == 0
421
+
422
+ if hasattr(config, "yarn_only_types") and layer_types[
423
+ self.layer_idx] not in config.yarn_only_types:
424
+ config.rope_parameters = None
425
+ else:
426
+ config.rope_parameters = getattr(config, "rope_scaling", None)
427
+
428
+ self.sliding_window = config.sliding_window
429
+ if enable_sliding_window:
430
+ self.num_attention_heads = config.attention_other_setting[
431
+ "num_attention_heads"]
432
+ self.num_key_value_heads = config.attention_other_setting[
433
+ "num_attention_groups"]
434
+
435
+ if self.sliding_window is not None and enable_sliding_window:
436
+ self.sliding_window = (self.sliding_window)
437
+ else:
438
+ self.sliding_window = None
439
+ self.head_dim = getattr(config, "head_dim",
440
+ config.hidden_size // self.num_attention_heads)
441
+ self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
442
+
443
+ self.rotary_emb = Step3p5RotaryEmbedding(config, layer_idx=layer_idx)
444
+
445
+ self.q_size = self.num_attention_heads * self.head_dim
446
+ self.kv_size = self.num_key_value_heads * self.head_dim
447
+ self.scaling = self.head_dim**-0.5
448
+
449
+ self.q_proj = nn.Linear(config.hidden_size, self.q_size, bias=False)
450
+ self.k_proj = nn.Linear(config.hidden_size, self.kv_size, bias=False)
451
+ self.v_proj = nn.Linear(config.hidden_size, self.kv_size, bias=False)
452
+ self.o_proj = nn.Linear(self.q_size, config.hidden_size, bias=False)
453
+ self.q_norm = Step3p5RMSNorm(self.head_dim,
454
+ eps=config.rms_norm_eps)
455
+ self.k_norm = Step3p5RMSNorm(self.head_dim,
456
+ eps=config.rms_norm_eps)
457
+
458
+ self.use_head_wise_attn_gate = config.use_head_wise_attn_gate
459
+ if self.use_head_wise_attn_gate:
460
+ self.g_proj = nn.Linear(config.hidden_size,
461
+ self.num_attention_heads,
462
+ bias=False)
463
+
464
+ self.use_rope = True
465
+ use_rope_layers = getattr(config, "use_rope_layers", None)
466
+ if use_rope_layers:
467
+ self.use_rope = use_rope_layers[self.layer_idx]
468
+
469
+ def forward(
470
+ self,
471
+ hidden_states: torch.Tensor,
472
+ attention_mask: Optional[torch.Tensor],
473
+ past_key_value: Optional[Cache] = None,
474
+ cache_position: Optional[torch.LongTensor] = None,
475
+ position_ids: Optional[torch.LongTensor] = None,
476
+ **kwargs: Unpack[FlashAttentionKwargs],
477
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
478
+ Optional[Tuple[torch.Tensor]]]:
479
+ input_shape = hidden_states.shape[:-1]
480
+ hidden_shape = (*input_shape, -1, self.head_dim)
481
+
482
+ query_states = self.q_norm(
483
+ self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
484
+ key_states = self.k_norm(
485
+ self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
486
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(
487
+ 1, 2)
488
+ if self.use_head_wise_attn_gate:
489
+ gate_states = self.g_proj(hidden_states)
490
+ cos, sin = self.rotary_emb(hidden_states, position_ids)
491
+
492
+ # cos, sin = position_embeddings
493
+ query_states, key_states = apply_rotary_pos_emb(
494
+ query_states, key_states, cos, sin)
495
+
496
+ # query_states, key_states = apply_rotary_pos_emb(query_norm_states, key_norm_states, cos, sin)
497
+ if past_key_value is not None:
498
+ # sin and cos are specific to RoPE models; position_ids needed for the static cache
499
+ cache_kwargs = {
500
+ "sin": sin,
501
+ "cos": cos,
502
+ "cache_position": cache_position
503
+ }
504
+ key_states, value_states = past_key_value.update(
505
+ key_states, value_states, self.layer_idx, cache_kwargs)
506
+
507
+ attention_interface: Callable = eager_attention_forward
508
+ # TODO: considering FP8;
509
+ # RuntimeError: Expected attn_mask dtype to be bool or float or to match query dtype,
510
+ # but got attn_mask.dtype: long int and query.dtype: c10::BFloat16 instead.
511
+ if self.config._attn_implementation != "eager":
512
+ attention_interface = ALL_ATTENTION_FUNCTIONS[
513
+ self.config._attn_implementation]
514
+
515
+ attn_output, attn_weights = attention_interface(
516
+ self,
517
+ query_states,
518
+ key_states,
519
+ value_states,
520
+ attention_mask,
521
+ dropout=0.0 if not self.training else self.attention_dropout,
522
+ scaling=self.scaling,
523
+ sliding_window=self.sliding_window, # main diff with Llama
524
+ **kwargs,
525
+ )
526
+ attn_output = attn_output.reshape(*input_shape, -1)
527
+ if self.use_head_wise_attn_gate:
528
+ output = attn_output.view(
529
+ *attn_output.shape[:-1], self.num_attention_heads,
530
+ self.head_dim) * gate_states.unsqueeze(-1).sigmoid()
531
+ attn_output = output.view(*attn_output.shape)
532
+ attn_output = self.o_proj(attn_output)
533
+
534
+ return attn_output, attn_weights
535
+
536
+
537
+ class Step3p5DecoderLayer(GradientCheckpointingLayer):
538
+
539
+ def __init__(self, config, layer_idx):
540
+ super().__init__()
541
+ self.hidden_size = config.hidden_size
542
+ self.layer_idx = layer_idx
543
+ self.self_attn = Step3p5Attention(config, layer_idx)
544
+ self.attention_type = config.layer_types[layer_idx]
545
+
546
+ moe_layers_enum = getattr(config, "moe_layers_enum", None)
547
+ if moe_layers_enum is not None:
548
+ moe_layers_idx = [
549
+ int(i) for i in moe_layers_enum.strip().split(',')
550
+ ]
551
+ else:
552
+ moe_layers_idx = [i for i in range(1, config.num_hidden_layers)]
553
+ self.is_moe_layer = layer_idx in moe_layers_idx
554
+ self.use_moe = False
555
+
556
+ if config.swiglu_limits_shared and config.swiglu_limits_shared[
557
+ layer_idx] is not None and config.swiglu_limits_shared[
558
+ layer_idx] != 0:
559
+ swiglu_limit_shared = config.swiglu_limits_shared[layer_idx]
560
+ else:
561
+ swiglu_limit_shared = None
562
+ if config.swiglu_limits and config.swiglu_limits[
563
+ layer_idx] is not None and config.swiglu_limits[layer_idx] != 0:
564
+ swiglu_limit = config.swiglu_limits[layer_idx]
565
+ else:
566
+ swiglu_limit = None
567
+ if self.is_moe_layer:
568
+ self.moe = Step3p5MoEMLP(config, swiglu_limit=swiglu_limit) #
569
+ self.share_expert = Step3p5MLP(
570
+ config,
571
+ intermediate_size=config.share_expert_dim,
572
+ swiglu_limit=swiglu_limit_shared)
573
+ self.use_moe = True
574
+ else:
575
+ self.mlp = Step3p5MLP(config,
576
+ intermediate_size=config.intermediate_size,
577
+ swiglu_limit=swiglu_limit_shared)
578
+
579
+ self.input_layernorm = Step3p5RMSNorm(
580
+ config.hidden_size,
581
+ eps=config.rms_norm_eps)
582
+ self.post_attention_layernorm = Step3p5RMSNorm(
583
+ config.hidden_size,
584
+ eps=config.rms_norm_eps)
585
+
586
+ def forward(
587
+ self,
588
+ hidden_states: torch.Tensor,
589
+ attention_mask: Optional[torch.Tensor] = None,
590
+ position_ids: Optional[torch.LongTensor] = None,
591
+ past_key_value: Optional[tuple[torch.Tensor]] = None,
592
+ cache_position: Optional[torch.LongTensor] = None,
593
+ **kwargs: Unpack[FlashAttentionKwargs],
594
+ ) -> torch.FloatTensor:
595
+ residual = hidden_states
596
+ hidden_states = self.input_layernorm(hidden_states)
597
+ hidden_states, _ = self.self_attn(
598
+ hidden_states=hidden_states,
599
+ attention_mask=attention_mask,
600
+ position_ids=position_ids,
601
+ past_key_value=past_key_value,
602
+ cache_position=cache_position,
603
+ **kwargs,
604
+ )
605
+ hidden_states = residual + hidden_states
606
+
607
+ # Fully Connected
608
+ residual = hidden_states
609
+ hidden_states = self.post_attention_layernorm(hidden_states)
610
+ if self.use_moe:
611
+ share_output = self.share_expert(hidden_states)
612
+ moe_output = self.moe(hidden_states)
613
+ ffn_output = moe_output + share_output
614
+ else:
615
+ ffn_output = self.mlp(hidden_states)
616
+ if isinstance(ffn_output, tuple):
617
+ hidden_states, _ = ffn_output
618
+ else:
619
+ hidden_states = ffn_output
620
+
621
+ hidden_states = residual + hidden_states
622
+ return hidden_states
623
+
624
+
625
+ class Step3p5PreTrainedModel(PreTrainedModel):
626
+ # Link this model family to its configuration class so PreTrainedModel.from_pretrained
627
+ # can load the config instead of failing with a NoneType error.
628
+ config_class = Step3p5Config
629
+ supports_gradient_checkpointing = True
630
+ _skip_keys_device_placement = ["past_key_values"]
631
+ _keys_to_ignore_on_load_unexpected = [
632
+ r"model\.layers\.45\.*",
633
+ r"model\.layers\.46\.*",
634
+ r"model\.layers\.47\.*"
635
+ ]
636
+ _supports_flash_attn = False
637
+ _supports_sdpa = True
638
+ _supports_flex_attn = True
639
+ _supports_static_cache = True
640
+ _supports_attention_backend = True
641
+
642
+
643
+ class Step3p5Model(Step3p5PreTrainedModel, GenerationMixin):
644
+ _no_split_modules = ["Step3p5DecoderLayer"]
645
+ base_model_prefix = "model"
646
+ _tied_weights_keys = ["lm_head.weight"]
647
+ config: Step3p5Config
648
+ def __init__(self, config: Step3p5Config):
649
+ super().__init__(config)
650
+ self.padding_idx = config.pad_token_id
651
+ self.vocab_size = config.vocab_size
652
+
653
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size,
654
+ self.padding_idx)
655
+ self.layers = nn.ModuleList([
656
+ Step3p5DecoderLayer(config, layer_idx)
657
+ for layer_idx in range(config.num_hidden_layers)
658
+ ])
659
+ self.norm = Step3p5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
660
+ self.gradient_checkpointing = False
661
+ self.has_sliding_layers = "sliding_attention" in self.config.layer_types
662
+
663
+ # Initialize weights and apply final processing
664
+ self.post_init()
665
+
666
+ def get_input_embeddings(self):
667
+ return self.embed_tokens
668
+
669
+ @can_return_tuple
670
+ def forward(
671
+ self,
672
+ input_ids: torch.LongTensor = None,
673
+ attention_mask: Optional[torch.Tensor] = None,
674
+ position_ids: Optional[torch.LongTensor] = None,
675
+ past_key_values: Optional[Cache] = None,
676
+ inputs_embeds: Optional[torch.FloatTensor] = None,
677
+ use_cache: Optional[bool] = None,
678
+ output_attentions: Optional[bool] = None,
679
+ output_hidden_states: Optional[bool] = None,
680
+ return_dict: Optional[bool] = None,
681
+ cache_position: Optional[torch.LongTensor] = None,
682
+ **kwargs: Unpack[TransformersKwargs],
683
+ ) -> Union[tuple, BaseModelOutputWithPast]:
684
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
685
+ output_hidden_states = (output_hidden_states
686
+ if output_hidden_states is not None else
687
+ self.config.output_hidden_states)
688
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
689
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
690
+ if (input_ids is None) ^ (inputs_embeds is not None):
691
+ raise ValueError(
692
+ "You must specify exactly one of input_ids or inputs_embeds")
693
+
694
+ if self.gradient_checkpointing and self.training and use_cache:
695
+ logger.warning_once(
696
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
697
+ )
698
+ use_cache = False
699
+
700
+ if inputs_embeds is None:
701
+ inputs_embeds = self.embed_tokens(
702
+ input_ids.to(self.embed_tokens.weight.device))
703
+
704
+ if use_cache and past_key_values is None:
705
+ past_key_values = DynamicCache()
706
+
707
+ if cache_position is None:
708
+ past_seen_tokens = past_key_values.get_seq_length(
709
+ ) if past_key_values is not None else 0
710
+ cache_position = torch.arange(past_seen_tokens,
711
+ past_seen_tokens +
712
+ inputs_embeds.shape[1],
713
+ device=inputs_embeds.device)
714
+
715
+ if position_ids is None:
716
+ position_ids = cache_position.unsqueeze(0)
717
+
718
+ hidden_states = inputs_embeds
719
+
720
+ # It may already have been prepared by e.g. `generate`
721
+ if not isinstance(causal_mask_mapping := attention_mask, dict):
722
+ # Prepare mask arguments
723
+ mask_kwargs = {
724
+ "config": self.config,
725
+ "input_embeds": inputs_embeds,
726
+ "attention_mask": attention_mask,
727
+ "cache_position": cache_position,
728
+ "past_key_values": past_key_values,
729
+ "position_ids": position_ids,
730
+ }
731
+ # Create the masks
732
+ causal_mask_mapping = {
733
+ "full_attention": create_causal_mask(**mask_kwargs),
734
+ }
735
+
736
+ # The sliding window alternating layers are not always activated depending on the config
737
+ if self.has_sliding_layers:
738
+ causal_mask_mapping[
739
+ "sliding_attention"] = create_sliding_window_causal_mask(
740
+ **mask_kwargs)
741
+
742
+ # # create position embeddings to be shared across the decoder layers
743
+ # decoder layers
744
+ all_hidden_states = () if output_hidden_states else None
745
+ all_self_attns = () if output_attentions else None
746
+ for decoder_layer in self.layers[:self.config.num_hidden_layers]:
747
+ if output_hidden_states:
748
+ all_hidden_states += (hidden_states, )
749
+
750
+ layer_outputs = decoder_layer(
751
+ hidden_states,
752
+ attention_mask=causal_mask_mapping[
753
+ decoder_layer.attention_type],
754
+ position_ids=position_ids,
755
+ past_key_value=past_key_values,
756
+ output_attentions=output_attentions,
757
+ use_cache=use_cache,
758
+ cache_position=cache_position,
759
+ **kwargs,
760
+ )
761
+
762
+ hidden_states = layer_outputs
763
+
764
+ hidden_states = self.norm(hidden_states)
765
+
766
+ return BaseModelOutputWithPast(
767
+ last_hidden_state=hidden_states,
768
+ past_key_values=past_key_values if use_cache else None,
769
+ hidden_states=all_hidden_states,
770
+ attentions=all_self_attns,
771
+ )
772
+
773
+
774
+ class Step3p5ForCausalLM(Step3p5PreTrainedModel, GenerationMixin):
775
+ _tied_weights_keys = ["lm_head.weight"]
776
+ config: Step3p5Config
777
+
778
+ def __init__(self, config: Step3p5Config):
779
+ super().__init__(config)
780
+ self.model = Step3p5Model(config)
781
+ self.lm_head = nn.Linear(config.hidden_size,
782
+ config.vocab_size,
783
+ bias=False)
784
+
785
+ self.post_init()
786
+
787
+ def get_input_embeddings(self):
788
+ return self.model.get_input_embeddings()
789
+
790
+ def set_input_embeddings(self, value):
791
+ self.model.set_input_embeddings(value)
792
+
793
+ def get_output_embeddings(self):
794
+ return self.model.get_output_embeddings()
795
+
796
+ def set_output_embeddings(self, new_embeddings):
797
+ self.model.set_output_embeddings(new_embeddings)
798
+
799
+ def set_decoder(self, decoder):
800
+ self.model.set_decoder(decoder)
801
+
802
+ def get_decoder(self):
803
+ return self.model.get_decoder()
804
+
805
+ def forward(
806
+ self,
807
+ input_ids: torch.LongTensor = None,
808
+ num_patches=None,
809
+ patch_pixel_values=None,
810
+ patch_newline_mask=None,
811
+ attention_mask: Optional[torch.Tensor] = None,
812
+ position_ids: Optional[torch.LongTensor] = None,
813
+ past_key_values: Optional[Cache] = None,
814
+ inputs_embeds: Optional[torch.FloatTensor] = None,
815
+ labels: Optional[torch.LongTensor] = None,
816
+ use_cache: Optional[bool] = None,
817
+ output_attentions: Optional[bool] = None,
818
+ output_hidden_states: Optional[bool] = None,
819
+ return_dict: Optional[bool] = None,
820
+ cache_position: Optional[torch.LongTensor] = None,
821
+ **kwargs: Unpack[TransformersKwargs],
822
+ ) -> Union[tuple, Step3p5CausalLMOutputWithPast]:
823
+ r"""
824
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
825
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
826
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
827
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
828
+ Example:
829
+ ```python
830
+ >>> from transformers import AutoTokenizer, Llama4ForCausalLM
831
+ >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
832
+ >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")
833
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
834
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
835
+ >>> # Generate
836
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
837
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
838
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
839
+ ```"""
840
+
841
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
842
+ output_hidden_states = (output_hidden_states
843
+ if output_hidden_states is not None else
844
+ self.config.output_hidden_states)
845
+ # breakpoint()
846
+ outputs = self.model(
847
+ input_ids=input_ids,
848
+ num_patches=num_patches,
849
+ patch_pixel_values=patch_pixel_values,
850
+ patch_newline_mask=patch_newline_mask,
851
+ position_ids=position_ids,
852
+ attention_mask=attention_mask,
853
+ past_key_values=past_key_values,
854
+ inputs_embeds=inputs_embeds,
855
+ use_cache=use_cache,
856
+ output_attentions=output_attentions,
857
+ output_hidden_states=output_hidden_states,
858
+ return_dict=return_dict,
859
+ cache_position=cache_position,
860
+ **kwargs,
861
+ )
862
+ hidden_states = outputs.last_hidden_state
863
+ logits = self.lm_head(hidden_states)
864
+
865
+ return Step3p5CausalLMOutputWithPast(logits=logits, )
866
+
867
+ def prepare_inputs_for_generation(
868
+ self,
869
+ input_ids,
870
+ past_key_values=None,
871
+ inputs_embeds=None,
872
+ pixel_values=None,
873
+ attention_mask=None,
874
+ cache_position=None,
875
+ logits_to_keep=None,
876
+ **kwargs,
877
+ ):
878
+
879
+ model_inputs = super().prepare_inputs_for_generation(
880
+ input_ids,
881
+ past_key_values=past_key_values,
882
+ inputs_embeds=inputs_embeds,
883
+ attention_mask=attention_mask,
884
+ cache_position=cache_position,
885
+ logits_to_keep=logits_to_keep,
886
+ **kwargs,
887
+ )
888
+
889
+ if cache_position[0] == 0:
890
+ # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
891
+ # Otherwise we need pixel values to be passed to model
892
+ model_inputs["pixel_values"] = pixel_values
893
+
894
+ return model_inputs
895
+
896
+ def _fix_state_dict_key_on_load(self, key: str) -> tuple[str, bool]:
897
+ if key.startswith("language_model."):
898
+ return key[len("language_model."):], True
899
+
900
+ return key, False
recipe.yaml ADDED
@@ -0,0 +1,7 @@
+ default_stage:
+   default_modifiers:
+     QuantizationModifier:
+       targets: [Linear, MoELinear]
+       ignore: [lm_head, 're:visual.*', 're:.*vision_tower.*', 're:.*video_tower.*', 're:.*audio_tower.*',
+         're:.*multi_modal_projector.*']
+       scheme: NVFP4
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<|begin▁of▁sentence|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|end▁of▁sentence|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff