stefan-it committed · Commit ecd99b8 · 1 Parent(s): ad632e3

model: add original TensorFlow checkpoints (25k steps on English corpus)

.gitattributes CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.ckpt-1025000.data-00001-of-00004 filter=lfs diff=lfs merge=lfs -text
+ model.ckpt-1025000.data-00002-of-00004 filter=lfs diff=lfs merge=lfs -text
+ model.ckpt-1025000.data-00003-of-00004 filter=lfs diff=lfs merge=lfs -text
+ model.ckpt-1025000.meta filter=lfs diff=lfs merge=lfs -text
model.ckpt-1025000.data-00000-of-00004 ADDED
Binary file (8 Bytes).
 
model.ckpt-1025000.data-00001-of-00004 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5b2a8f4c8cf3bcaa1da9343d44d506b04c955ab679552f08e61944e4dee69f9
+ size 200509952
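
These pointer files record only the object's SHA-256 and byte size; the actual shard bytes live in LFS storage. A minimal sketch for checking a downloaded shard against its pointer (the local path is illustrative; the oid and size are copied from the pointer above):

```python
import hashlib
import os

# Illustrative local path to the downloaded shard.
shard = "model.ckpt-1025000.data-00001-of-00004"

# Values taken from the LFS pointer file shown above.
expected_oid = "c5b2a8f4c8cf3bcaa1da9343d44d506b04c955ab679552f08e61944e4dee69f9"
expected_size = 200509952

# Hash in 1 MiB chunks to avoid holding the ~200 MB file in memory.
digest = hashlib.sha256()
with open(shard, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

assert os.path.getsize(shard) == expected_size, "size mismatch"
assert digest.hexdigest() == expected_oid, "sha256 mismatch"
print("shard matches its LFS pointer")
```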
model.ckpt-1025000.data-00002-of-00004 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d1854d70b74cf0849f029aa04f1e60f714edee80ba68a39a23b2a3ea93f299f6
+ size 200284288
model.ckpt-1025000.data-00003-of-00004 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4c5e341bcd1e8e665d8bd49b64af906cd3eb65b372d8fcc5ebc590863c73478
+ size 200286080
model.ckpt-1025000.index ADDED
Binary file (7.98 kB).
 
model.ckpt-1025000.meta ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5d879a36016040ca06292f773e7b4dc3d9169d046cca2c3fe0e0d6c346f2cde
+ size 24379408
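
Together, the four data shards plus the .index and .meta files form one sharded TensorFlow checkpoint with prefix model.ckpt-1025000, i.e. 25k steps past the run.init_checkpoint step 1000000 recorded in the config below, matching the commit message. A hedged sketch of inspecting it with the standard TF checkpoint reader:

```python
import tensorflow as tf

# Checkpoint *prefix*, not an individual file: TF resolves the
# data-?????-of-00004 shards and the .index file from this stem.
ckpt = "model.ckpt-1025000"

reader = tf.train.load_checkpoint(ckpt)
shapes = reader.get_variable_to_shape_map()

# Print a few variables; shapes should reflect the gin config below
# (d_model = 1472, d_ff = 3584, 12 encoder / 4 decoder layers).
for name in sorted(shapes)[:10]:
    print(name, shapes[name])
```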
operative_config.gin ADDED
@@ -0,0 +1,403 @@
+ import mesh_tensorflow.optimize
+ import mesh_tensorflow.transformer.dataset as mesh_tensorflow2
+ import mesh_tensorflow.transformer.learning_rate_schedules as mesh_tensorflow3
+ import mesh_tensorflow.transformer.t2t_vocabulary as mesh_tensorflow4
+ import mesh_tensorflow.transformer.transformer as mesh_tensorflow5
+ import mesh_tensorflow.transformer.transformer_layers as mesh_tensorflow6
+ import mesh_tensorflow.transformer.utils as mesh_tensorflow7
+ import t5.models.mesh_transformer
+
+ # Macros:
+ # ==============================================================================
+ d_ff = 3584
+ d_kv = 64
+ d_model = 1472
+ dropout_rate = 0.0
+ inputs_length = 1024
+ mean_noise_span_length = 20.0
+ MIXTURE_NAME = 'en_corpus'
+ noise_density = 0.15
+ num_heads = 6
+ targets_length = @preprocessors.random_spans_targets_length()
+
+ # Parameters for adafactor_decay_rate_pow:
+ # ==============================================================================
+ adafactor_decay_rate_pow.exponent = 0.8
+ adafactor_decay_rate_pow.offset = 0
+
+ # Parameters for AdafactorOptimizer:
+ # ==============================================================================
+ AdafactorOptimizer.beta1 = 0.0
+ AdafactorOptimizer.clipping_threshold = 1.0
+ AdafactorOptimizer.decay_rate = None
+ AdafactorOptimizer.epsilon1 = 1e-30
+ AdafactorOptimizer.epsilon2 = 0.001
+ AdafactorOptimizer.exclude_from_parameter_scale = None
+ AdafactorOptimizer.factored = True
+ AdafactorOptimizer.min_dim_size_to_factor = 128
+ AdafactorOptimizer.multiply_by_parameter_scale = True
+ AdafactorOptimizer.stacked_dim_names = None
+
+ # Parameters for Bitransformer:
+ # ==============================================================================
+ Bitransformer.shared_embedding = True
+
+ # Parameters for denoise:
+ # ==============================================================================
+ denoise.passthrough_feature_keys = None
+
+ # Parameters for decoder/DenseReluDense:
+ # ==============================================================================
+ decoder/DenseReluDense.activation = ['gelu', 'linear']
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
+ decoder/DenseReluDense.hidden_size = %d_ff
+ decoder/DenseReluDense.use_bias = False
+
+ # Parameters for encoder/DenseReluDense:
+ # ==============================================================================
+ encoder/DenseReluDense.activation = ['gelu', 'linear']
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
+ encoder/DenseReluDense.hidden_size = %d_ff
+ encoder/DenseReluDense.use_bias = False
+
+ # Parameters for enc_dec_attention:
+ # ==============================================================================
+ # None.
+
+ # Parameters for enc_dec_attention_bias:
+ # ==============================================================================
+ # None.
+
+ # Parameters for decoder/EncDecAttention:
+ # ==============================================================================
+ decoder/EncDecAttention.relative_attention_type = None
+
+ # Parameters for get_variable_dtype:
+ # ==============================================================================
+ get_variable_dtype.activation_dtype = 'bfloat16'
+
+ # Parameters for get_vocab_embedding_cls:
+ # ==============================================================================
+ # None.
+
+ # Parameters for get_vocabulary:
+ # ==============================================================================
+ get_vocabulary.mixture_or_task_name = %MIXTURE_NAME
+
+ # Parameters for init_checkpoint_variable_mapping:
+ # ==============================================================================
+ init_checkpoint_variable_mapping.mapping_fn = None
+
+ # Parameters for decoder/LayerStack:
+ # ==============================================================================
+ decoder/LayerStack.dropout_rate = None
+ decoder/LayerStack.norm_epsilon = None
+ decoder/LayerStack.recompute_grads = False
+ decoder/LayerStack.sublayers_final = \
+     [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
+ decoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
+ decoder/LayerStack.sublayers_per_layer = \
+     [@transformer.sublayer_rms_norm,
+      @transformer.sublayer_call_layer,
+      @transformer.sublayer_dropout,
+      @transformer.sublayer_residual]
+
+ # Parameters for encoder/LayerStack:
+ # ==============================================================================
+ encoder/LayerStack.dropout_rate = None
+ encoder/LayerStack.norm_epsilon = None
+ encoder/LayerStack.recompute_grads = False
+ encoder/LayerStack.sublayers_final = \
+     [@transformer.sublayer_rms_norm, @transformer.sublayer_dropout]
+ encoder/LayerStack.sublayers_initial = [@transformer.sublayer_dropout]
+ encoder/LayerStack.sublayers_per_layer = \
+     [@transformer.sublayer_rms_norm,
+      @transformer.sublayer_call_layer,
+      @transformer.sublayer_dropout,
+      @transformer.sublayer_residual]
+
+ # Parameters for learning_rate_schedule_noam:
+ # ==============================================================================
+ learning_rate_schedule_noam.linear_decay_fraction = 0.0
+ learning_rate_schedule_noam.multiplier = 1.0
+ learning_rate_schedule_noam.offset = 0
+ learning_rate_schedule_noam.warmup_steps = 10000
+
+ # Parameters for make_bitransformer:
+ # ==============================================================================
+ make_bitransformer.decoder_name = 'decoder'
+ make_bitransformer.encoder_name = 'encoder'
+
+ # Parameters for decoder/make_layer_stack:
+ # ==============================================================================
+ decoder/make_layer_stack.block_scope = True
+ decoder/make_layer_stack.layers = \
+     [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
+      @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
+      @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
+ decoder/make_layer_stack.num_layers = 4
+
+ # Parameters for encoder/make_layer_stack:
+ # ==============================================================================
+ encoder/make_layer_stack.block_scope = True
+ encoder/make_layer_stack.layers = \
+     [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
+      @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
+ encoder/make_layer_stack.num_layers = 12
+
+ # Parameters for mesh_train_dataset_fn:
+ # ==============================================================================
+ mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
+ mesh_train_dataset_fn.pack = True
+ mesh_train_dataset_fn.seed = None
+ mesh_train_dataset_fn.shuffle = False
+ mesh_train_dataset_fn.use_cached = False
+
+ # Parameters for noise_span_to_unique_sentinel:
+ # ==============================================================================
+ # None.
+
+ # Parameters for nonnoise_span_to_unique_sentinel:
+ # ==============================================================================
+ # None.
+
+ # Parameters for pack_dataset:
+ # ==============================================================================
+ pack_dataset.use_custom_ops = False
+
+ # Parameters for pack_or_pad:
+ # ==============================================================================
+ # None.
+
+ # Parameters for random_spans_helper:
+ # ==============================================================================
+ random_spans_helper.extra_tokens_per_span_inputs = 1
+ random_spans_helper.extra_tokens_per_span_targets = 1
+ random_spans_helper.inputs_length = %inputs_length
+ random_spans_helper.mean_noise_span_length = %mean_noise_span_length
+ random_spans_helper.noise_density = %noise_density
+ random_spans_helper.verbose = False
+
+ # Parameters for targets_length/random_spans_helper:
+ # ==============================================================================
+ targets_length/random_spans_helper.extra_tokens_per_span_inputs = 1
+ targets_length/random_spans_helper.extra_tokens_per_span_targets = 1
+ targets_length/random_spans_helper.inputs_length = %inputs_length
+ targets_length/random_spans_helper.mean_noise_span_length = %mean_noise_span_length
+ targets_length/random_spans_helper.noise_density = %noise_density
+ targets_length/random_spans_helper.verbose = False
+
+ # Parameters for random_spans_noise_mask:
+ # ==============================================================================
+ # None.
+
+ # Parameters for targets_length/random_spans_targets_length:
+ # ==============================================================================
+ # None.
+
+ # Parameters for random_spans_tokens_length:
+ # ==============================================================================
+ # None.
+
+ # Parameters for reduce_concat_tokens:
+ # ==============================================================================
+ # None.
+
+ # Parameters for rewrite_stack_variables:
+ # ==============================================================================
+ rewrite_stack_variables.max_combined_variable_size = 536870912
+
+ # Parameters for run:
+ # ==============================================================================
+ run.autostack = True
+ run.batch_size = ('tokens_per_batch', 1048576)
+ run.checkpoint_input_pipeline = False
+ run.dataset_split = 'train'
+ run.ensemble_inputs = None
+ run.eval_checkpoint_step = None
+ run.eval_dataset_fn = None
+ run.eval_dir_suffix = None
+ run.eval_summary_dir = None
+ run.export_checkpoint_step = None
+ run.export_path = ''
+ run.init_checkpoint = \
+     'gs://t5-data/pretrained_models/byt5/small/model.ckpt-1000000'
+ run.iterations_per_loop = 100
+ run.keep_checkpoint_max = None
+ run.layout_rules = \
+     'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
+ run.learning_rate_schedule = @learning_rate_schedules.learning_rate_schedule_noam
+ run.mesh_devices = None
+ run.mesh_shape = @mesh_tensorflow.transformer.utils.tpu_mesh_shape()
+ run.mode = 'train'
+ run.model_type = 'bitransformer'
+ run.optimizer = @optimize.AdafactorOptimizer
+ run.output_eval_examples = True
+ run.perplexity_eval_steps = 100
+ run.predict_fn = None
+ run.save_checkpoints_steps = 25000
+ run.seen_data_init_step = 0
+ run.sequence_length = {'inputs': %inputs_length, 'targets': %targets_length}
+ run.skip_seen_data = False
+ run.total_run_steps = None
+ run.train_dataset_fn = @t5.models.mesh_transformer.mesh_train_dataset_fn
+ run.train_steps = 1200000
+ run.variable_filter = None
+
+ # Parameters for select_random_chunk:
+ # ==============================================================================
+ select_random_chunk.additional_feature_keys = None
+ select_random_chunk.additional_passthrough_keys = None
+ select_random_chunk.min_length = None
+ select_random_chunk.passthrough_feature_keys = None
+ select_random_chunk.sequence_length = None
+ select_random_chunk.uniform_random_start = False
+
+ # Parameters for decoder/SelfAttention:
+ # ==============================================================================
+ decoder/SelfAttention.attention_func = None
+ decoder/SelfAttention.attention_kwargs = None
+ decoder/SelfAttention.combine_dims = True
+ decoder/SelfAttention.dropout_rate = %dropout_rate
+ decoder/SelfAttention.fold_scaling_into_initializer = True
+ decoder/SelfAttention.hyperprompt_hidden_dim = None
+ decoder/SelfAttention.hyperprompt_length_decoder = None
+ decoder/SelfAttention.hyperprompt_length_encoder = None
+ decoder/SelfAttention.hyperprompt_mtlshare = False
+ decoder/SelfAttention.hyperprompt_task_num = 8
+ decoder/SelfAttention.keep_query_heads_dims = False
+ decoder/SelfAttention.key_value_size = %d_kv
+ decoder/SelfAttention.num_heads = %num_heads
+ decoder/SelfAttention.num_memory_heads = 0
+ decoder/SelfAttention.relative_attention_num_buckets = 32
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
+ decoder/SelfAttention.shared_kv = False
+ decoder/SelfAttention.use_hyperprompt = False
+ decoder/SelfAttention.z_loss_coeff = None
+
+ # Parameters for encoder/SelfAttention:
+ # ==============================================================================
+ encoder/SelfAttention.attention_func = None
+ encoder/SelfAttention.attention_kwargs = None
+ encoder/SelfAttention.combine_dims = True
+ encoder/SelfAttention.dropout_rate = %dropout_rate
+ encoder/SelfAttention.fold_scaling_into_initializer = True
+ encoder/SelfAttention.hyperprompt_hidden_dim = None
+ encoder/SelfAttention.hyperprompt_length_decoder = None
+ encoder/SelfAttention.hyperprompt_length_encoder = None
+ encoder/SelfAttention.hyperprompt_mtlshare = False
+ encoder/SelfAttention.hyperprompt_task_num = 8
+ encoder/SelfAttention.keep_query_heads_dims = False
+ encoder/SelfAttention.key_value_size = %d_kv
+ encoder/SelfAttention.num_heads = %num_heads
+ encoder/SelfAttention.num_memory_heads = 0
+ encoder/SelfAttention.relative_attention_num_buckets = 32
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
+ encoder/SelfAttention.shared_kv = False
+ encoder/SelfAttention.use_hyperprompt = False
+ encoder/SelfAttention.z_loss_coeff = None
+
+ # Parameters for sentinel_id:
+ # ==============================================================================
+ sentinel_id.return_value = None
+
+ # Parameters for serialize_num_microbatches:
+ # ==============================================================================
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 2048
+
+ # Parameters for should_load_variable:
+ # ==============================================================================
+ should_load_variable.filter_fn = None
+
+ # Parameters for SimdMeshImpl:
+ # ==============================================================================
+ SimdMeshImpl.allreduce_in_bfloat16_max_group_size = 8
+
+ # Parameters for split_tokens:
+ # ==============================================================================
+ split_tokens.additional_feature_keys = None
+ split_tokens.passthrough_feature_keys = None
+
+ # Parameters for sublayer_call_layer:
+ # ==============================================================================
+ # None.
+
+ # Parameters for sublayer_dropout:
+ # ==============================================================================
+ sublayer_dropout.dropout_rate = %dropout_rate
+
+ # Parameters for sublayer_mask_padding:
+ # ==============================================================================
+ # None.
+
+ # Parameters for sublayer_residual:
+ # ==============================================================================
+ # None.
+
+ # Parameters for sublayer_rms_norm:
+ # ==============================================================================
+ sublayer_rms_norm.epsilon = 1e-06
+ sublayer_rms_norm.name = 'rms_norm'
+
+ # Parameters for tpu_estimator_model_fn:
+ # ==============================================================================
+ tpu_estimator_model_fn.hierarchical_tiling_spec = None
+ tpu_estimator_model_fn.init_variable_filter = ''
+ tpu_estimator_model_fn.model_info_file = ''
+ tpu_estimator_model_fn.outer_batch_size = 1
+ tpu_estimator_model_fn.tpu_summaries = False
+ tpu_estimator_model_fn.weight_decay_checkpoint = None
+
+ # Parameters for tpu_mesh_shape:
+ # ==============================================================================
+ tpu_mesh_shape.ensemble_parallelism = None
+ tpu_mesh_shape.model_parallelism = 1
+ tpu_mesh_shape.tpu_topology = 'v3-32'
+
+ # Parameters for unit_scaling_convention:
+ # ==============================================================================
+ unit_scaling_convention.value = False
+
+ # Parameters for decoder/Unitransformer:
+ # ==============================================================================
+ decoder/Unitransformer.d_model = %d_model
+ decoder/Unitransformer.ensemble = None
+ decoder/Unitransformer.input_full_attention = False
+ decoder/Unitransformer.label_smoothing = 0.0
+ decoder/Unitransformer.loss_denominator = None
+ decoder/Unitransformer.loss_fn = None
+ decoder/Unitransformer.loss_on_targets_only = False
+ decoder/Unitransformer.max_length = 512
+ decoder/Unitransformer.positional_embedding = False
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = False
+ decoder/Unitransformer.sinusoid_positional_embedding = False
+ decoder/Unitransformer.token_dropout_rate = 0.0
+ decoder/Unitransformer.vocab_divisor = 128
+ decoder/Unitransformer.z_loss = 0.0001
+
+ # Parameters for encoder/Unitransformer:
+ # ==============================================================================
+ encoder/Unitransformer.d_model = %d_model
+ encoder/Unitransformer.ensemble = None
+ encoder/Unitransformer.input_full_attention = False
+ encoder/Unitransformer.label_smoothing = 0.0
+ encoder/Unitransformer.loss_denominator = None
+ encoder/Unitransformer.loss_fn = None
+ encoder/Unitransformer.loss_on_targets_only = False
+ encoder/Unitransformer.max_length = 512
+ encoder/Unitransformer.positional_embedding = False
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = False
+ encoder/Unitransformer.sinusoid_positional_embedding = False
+ encoder/Unitransformer.token_dropout_rate = 0.0
+ encoder/Unitransformer.vocab_divisor = 128
+ encoder/Unitransformer.z_loss = 0.0001
+
+ # Parameters for VarianceScalingInitializer:
+ # ==============================================================================
+ VarianceScalingInitializer.distribution = 'normal'
+ VarianceScalingInitializer.mode = 'fan_in'
+ VarianceScalingInitializer.scale = 1.0
+
+ # Parameters for VocabEmbedding:
+ # ==============================================================================
+ VocabEmbedding.scale_variable_like_classifier_weights = False
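
The targets_length macro in this config is computed, not hand-set. The sketch below follows the length arithmetic of T5's random_spans_helper (reimplemented here from memory, so treat it as an approximation rather than the canonical code); with the macros above (inputs_length = 1024, noise_density = 0.15, mean_noise_span_length = 20.0) it yields a targets length of 189:

```python
def random_spans_lengths(inputs_length=1024, noise_density=0.15,
                         mean_noise_span_length=20.0,
                         extra_tokens_per_span_inputs=1,
                         extra_tokens_per_span_targets=1):
    """Find the raw token count and targets length for span corruption."""

    def lengths(tokens_length):
        num_noise = int(round(tokens_length * noise_density))
        num_spans = int(round(num_noise / mean_noise_span_length))
        num_nonnoise = tokens_length - num_noise
        # Inputs keep the non-noise tokens plus one sentinel per span plus EOS;
        # targets keep the noise tokens plus one sentinel per span plus EOS.
        inputs = num_nonnoise + num_spans * extra_tokens_per_span_inputs + 1
        targets = num_noise + num_spans * extra_tokens_per_span_targets + 1
        return inputs, targets

    # Grow the raw length until the corrupted inputs would overflow.
    tokens_length = inputs_length
    while lengths(tokens_length + 1)[0] <= inputs_length:
        tokens_length += 1
    return tokens_length, lengths(tokens_length)[1]

print(random_spans_lengths())  # -> (1193, 189) for the macros above
```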