Girinath11 committed
Commit d760a3c · verified · 1 Parent(s): 2c7c331

Create configuration_mixture_of_recursions.py

configuration_mixture_of_recursions.py ADDED
@@ -0,0 +1,73 @@
+ # configuration_mixture_of_recursions.py
+ # Create this file in your repository root
+
+ from transformers import PretrainedConfig
+
+
+ class MixtureOfRecursionsConfig(PretrainedConfig):
+     """
+     Configuration class for MixtureOfRecursions model.
+
+     This class stores the configuration of a MixtureOfRecursions model built from recursive transformer layers.
+     """
+     model_type = "mixture_of_recursions"
+
+     def __init__(
+         self,
+         vocab_size=10000,
+         hidden_size=256,
+         num_hidden_layers=4,
+         num_attention_heads=8,
+         intermediate_size=1024,
+         max_position_embeddings=512,
+         max_recursion_depth=3,
+         attention_dropout=0.1,
+         hidden_dropout=0.1,
+         initializer_range=0.02,
+         layer_norm_eps=1e-5,
+         use_cache=True,
+         pad_token_id=0,
+         bos_token_id=1,
+         eos_token_id=2,
+         tie_word_embeddings=False,
+         **kwargs
+     ):
+         """
+         Args:
+             vocab_size (int): Vocabulary size of the model
+             hidden_size (int): Dimension of the hidden representations
+             num_hidden_layers (int): Number of transformer layers
+             num_attention_heads (int): Number of attention heads
+             intermediate_size (int): Dimension of the feedforward network
+             max_position_embeddings (int): Maximum sequence length
+             max_recursion_depth (int): Maximum depth of recursive processing
+             attention_dropout (float): Dropout probability for attention layers
+             hidden_dropout (float): Dropout probability for hidden layers
+             initializer_range (float): Standard deviation for weight initialization
+             layer_norm_eps (float): Epsilon for layer normalization
+             use_cache (bool): Whether to use past key values for faster generation
+             pad_token_id (int): Token ID for padding
+             bos_token_id (int): Token ID for beginning of sequence
+             eos_token_id (int): Token ID for end of sequence
+             tie_word_embeddings (bool): Whether to tie input and output embeddings
+         """
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs
+         )
+
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.max_recursion_depth = max_recursion_depth
+         self.attention_dropout = attention_dropout
+         self.hidden_dropout = hidden_dropout
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.use_cache = use_cache
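
A minimal usage sketch, not part of the committed file: it assumes configuration_mixture_of_recursions.py is importable from the repository root (as the comment at the top of the file suggests), and the output directory ./mor-checkpoint and the override values are illustrative only.

# usage_sketch.py (illustrative, not from the commit above)
from transformers import AutoConfig

from configuration_mixture_of_recursions import MixtureOfRecursionsConfig

# Instantiate with the defaults defined in the config class, overriding two fields.
config = MixtureOfRecursionsConfig(vocab_size=32000, max_recursion_depth=4)

# Serialize to config.json; the directory name is an arbitrary example.
config.save_pretrained("./mor-checkpoint")

# Register the model_type so AutoConfig can resolve the custom class locally,
# then reload the saved configuration.
AutoConfig.register("mixture_of_recursions", MixtureOfRecursionsConfig)
reloaded = AutoConfig.from_pretrained("./mor-checkpoint")
assert reloaded.max_recursion_depth == 4

Registering "mixture_of_recursions" with AutoConfig is what lets AutoConfig.from_pretrained map the model_type stored in config.json back to this class when it is available locally.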