Update model_slm.py
model_slm.py CHANGED (+58 -0)
@@ -5,6 +5,64 @@ import math
 from typing import Optional, Tuple, Union
 from embeddings import TechEmbeddingLayer, create_padding_mask, create_causal_mask
 
+# ============================================================================
+# TRANSFORMERS COMPATIBILITY - ADD THIS SECTION
+# ============================================================================
+from transformers import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+
+class MixtureOfRecursionsConfig(PretrainedConfig):
+    """Configuration class for MixtureOfRecursions model."""
+
+    model_type = "mixture_of_recursions"
+
+    def __init__(
+        self,
+        vocab_size=31985,
+        d_model=384,
+        n_layers=12,
+        n_heads=6,
+        max_steps=4,
+        dim_feedforward=2048,
+        dropout=0.1,
+        max_seq_len=128,
+        router_type="adaptive",
+        padding_idx=0,
+        pos_encoding="learned",
+        # Transformers standard names (for compatibility)
+        hidden_size=None,
+        num_hidden_layers=None,
+        num_attention_heads=None,
+        intermediate_size=None,
+        max_position_embeddings=None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        # Your model's parameters
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.max_steps = max_steps
+        self.dim_feedforward = dim_feedforward
+        self.dropout = dropout
+        self.max_seq_len = max_seq_len
+        self.router_type = router_type
+        self.padding_idx = padding_idx
+        self.pos_encoding = pos_encoding
+
+        # Transformers standard aliases (for compatibility)
+        self.hidden_size = hidden_size or d_model
+        self.num_hidden_layers = num_hidden_layers or n_layers
+        self.num_attention_heads = num_attention_heads or n_heads
+        self.intermediate_size = intermediate_size or dim_feedforward
+        self.max_position_embeddings = max_position_embeddings or max_seq_len
+
+# ============================================================================
+# END TRANSFORMERS COMPATIBILITY SECTION
+# ============================================================================
+
 # Constants for default configuration
 DEFAULT_D_MODEL = 512
 DEFAULT_N_HEADS = 8
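
After this change the configuration participates in the standard Transformers serialization flow. A minimal sketch (not part of the commit) of the round-trip it enables; the directory name "mor_config_demo" is arbitrary, and model_slm is assumed to be importable:

    from transformers import AutoConfig
    from model_slm import MixtureOfRecursionsConfig

    config = MixtureOfRecursionsConfig()

    # The standard-name aliases mirror the native fields, so generic
    # Transformers tooling that reads hidden_size etc. keeps working.
    assert config.hidden_size == config.d_model
    assert config.num_attention_heads == config.n_heads

    # PretrainedConfig provides JSON serialization out of the box.
    config.save_pretrained("mor_config_demo")  # writes mor_config_demo/config.json
    reloaded = MixtureOfRecursionsConfig.from_pretrained("mor_config_demo")
    assert reloaded.model_type == "mixture_of_recursions"

    # Optional, and an assumption rather than part of this diff: registering
    # the custom model_type lets AutoConfig resolve it by name.
    AutoConfig.register("mixture_of_recursions", MixtureOfRecursionsConfig)
    auto_loaded = AutoConfig.from_pretrained("mor_config_demo")

One note on the alias fallbacks: hidden_size or d_model works here because every aliased value is a positive integer, but an explicit "hidden_size if hidden_size is not None else d_model" would be needed for any parameter where 0 is a meaningful setting.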
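
The diff also imports PreTrainedModel, which this hunk does not yet use; presumably it is intended for a model wrapper defined elsewhere in model_slm.py. A hypothetical sketch of that pairing, with stand-in embedding/LM-head layers where the real MixtureOfRecursions blocks would go:

    import torch.nn as nn
    from transformers.modeling_utils import PreTrainedModel
    from model_slm import MixtureOfRecursionsConfig

    class MixtureOfRecursionsModel(PreTrainedModel):
        # Ties the wrapper to the new config so from_pretrained can
        # reconstruct the model from config.json.
        config_class = MixtureOfRecursionsConfig

        def __init__(self, config):
            super().__init__(config)
            # Stand-in layers; the actual recursion blocks live in model_slm.py.
            self.embed = nn.Embedding(config.vocab_size, config.d_model,
                                      padding_idx=config.padding_idx)
            self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
            self.post_init()  # standard Transformers weight-init hook

        def forward(self, input_ids, **kwargs):
            return self.lm_head(self.embed(input_ids))

With such a wrapper in place, model.save_pretrained(...) and MixtureOfRecursionsModel.from_pretrained(...) handle weights and config together.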
|