# gpt_config.yaml
# WARNING: YAML configs are currently an experimental feature
language_model:
  # model architecture
  num_layers: 24
  hidden_size: 1024
  num_attention_heads: 16
  num_query_groups: null

  ffn_hidden_size: null
  kv_channels: null
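  # The null values above are filled in at runtime. Assuming Megatron-Core's
  # usual defaults: num_query_groups falls back to num_attention_heads (plain
  # multi-head attention), kv_channels to hidden_size / num_attention_heads
  # (1024 / 16 = 64 here), and ffn_hidden_size to 4 * hidden_size (SwiGLU
  # variants typically shrink this to roughly 8/3 * hidden_size). Verify
  # against your Megatron version before relying on these numbers.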
  hidden_dropout: 0.0
  attention_dropout: 0.0
  fp32_residual_connection: False

  apply_residual_connection_post_layernorm: False
  layernorm_epsilon: 1.e-5
  layernorm_zero_centered_gamma: True
  add_bias_linear: False
  bias_activation_fusion: False
  add_qkv_bias: False
  gated_linear_unit: False
  activation_func: swiglu
  num_moe_experts: null
  rotary_interleaved: False
  window_size: null

  # initialization
  init_method: null
  init_method_std: 0.02
  output_layer_init_method: null
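  # With both init methods left null, weights are typically drawn from a normal
  # distribution with std init_method_std, and the output-layer init is scaled
  # down by 1/sqrt(2 * num_layers) following Megatron's usual scheme (an
  # assumption; check the transformer config defaults in your release).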

  # mixed-precision
  apply_query_key_layer_scaling: False
  attention_softmax_in_fp32: False

  # fusion
  bias_swiglu_fusion: True
  masked_softmax_fusion: True
  persist_layer_norm: False
  memory_efficient_layer_norm: False
  bias_dropout_fusion: True
  apply_rope_fusion: True

  # activation recomputation
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  distribute_saved_activations: null
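  # Recomputation is disabled here (all null). A commented-out sketch for
  # block-wise full recomputation, assuming the standard Megatron-Core option
  # names; uncomment and tune recompute_num_layers to trade compute for memory:
  # recompute_granularity: full
  # recompute_method: block
  # recompute_num_layers: 1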

  # fp8 related
  fp8: null
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1
  fp8_amax_compute_algo: "most_recent"
  fp8_wgrad: True
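  # fp8: null keeps FP8 off. With Transformer Engine available, "e4m3" or
  # "hybrid" are the usual values (an assumption; confirm the accepted strings
  # for your Transformer Engine / Megatron-Core versions), e.g.:
  # fp8: "hybrid"
  # fp8_amax_history_len: 1024
  # fp8_amax_compute_algo: "max"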

  # miscellaneous
  clone_scatter_output_in_embedding: True

  normalization: "LayerNorm"  # alternative value supported by Transformer Engine: "RMSNorm"

  # MoE related
  moe_router_load_balancing_type: "aux_loss"
  moe_router_topk: 2
  moe_grouped_gemm: False
  moe_aux_loss_coeff: 0  # 1e-2 would be a good start value for load balance loss.
  moe_z_loss_coeff: null  # 1e-3 would be a good start value for z-loss
  moe_input_jitter_eps: null
  moe_token_dropping: False
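  # Because num_moe_experts is null, this is a dense model and the MoE settings
  # above are inert. A commented-out sketch of a small MoE variant (values are
  # illustrative, not tuned):
  # num_moe_experts: 8
  # moe_router_topk: 2
  # moe_aux_loss_coeff: 1.e-2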

model_parallel:
  # Model parallelism
  tensor_model_parallel_size: 1
  context_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  sequence_parallel: True
  expert_model_parallel_size: 1
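  # With TP = PP = CP = 1 the model is replicated, so the data-parallel size is
  # simply the world size; in general DP = world_size / (TP * PP * CP).
  # sequence_parallel only has an effect when tensor_model_parallel_size > 1,
  # so it is effectively a no-op in this configuration.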

  # Initialization
  perform_initialization: True
  use_cpu_initialization: null

  # Training
  fp16: False
  bf16: True
  params_dtype: null # Set from above arguments for core
  timers: null

  # Optimizations
  gradient_accumulation_fusion: True
  async_tensor_model_parallel_allreduce: True
  tp_comm_overlap: False

  # Debug Options
  tp_comm_split_ag: True
  tp_comm_atomic_ag: True
  tp_comm_split_rs: True
  tp_comm_atomic_rs: True
  tp_comm_bulk_wgrad: True
  tp_comm_bulk_dgrad: True

  # Parallelism
  finalize_model_grads_func: null

  # Pipeline Parallel
  pipeline_dtype: null
  grad_scale_func: null
  enable_autocast: False
  autocast_dtype: null
  variable_seq_lengths: False
  num_microbatches_with_partial_activation_checkpoints: null
  overlap_p2p_comm: False
  batch_p2p_comm: True
  batch_p2p_sync: True
  use_ring_exchange_p2p: False
  deallocate_pipeline_outputs: False
  no_sync_func: null
  grad_sync_func: null
  param_sync_func: null
  pipeline_model_parallel_split_rank: null

  # CPU Offloading
  cpu_offloading: False
  cpu_offloading_num_layers: 0
  _cpu_offloading_context: null
  cpu_offloading_weights: False
  cpu_offloading_activations: True

  # Timing
  barrier_with_L1_time: True

# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160] 
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
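# Batch-size bookkeeping: global_batch_size = micro_batch_size * data-parallel
# size * gradient-accumulation steps, so with micro_batch_size 2 the product of
# DP size and accumulation steps must be 64. rampup_batch_size follows
# Megatron's [start, increment, ramp-up samples] convention: start at 32, grow
# by 32 until reaching 128, spread over the first 65,324,160 samples.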

encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False


exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null

untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
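# RoPE is applied to rotary_percent = 0.5 of each head's dimensions, i.e. 32 of
# the 64 channels per head under the defaults noted above. max_position_embeddings
# matches seq_length (4096); rotary_seq_len_interpolation_factor stays null
# unless extending the context beyond the trained length.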

transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False

# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
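# The schedule is sample-based (train_iters is null): cosine decay from
# lr 2.5e-4 to min_lr 2.5e-5 over lr_decay_samples, after a warmup of
# lr_warmup_samples (81,381 samples). override_opt_param_scheduler and
# use_checkpoint_opt_param_scheduler control whether these values or the ones
# stored in a resumed checkpoint take precedence.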

# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False

# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000 
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
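# Dynamic loss scaling (initial_loss_scale, loss_scale_window, hysteresis) only
# applies when training in fp16; with bf16: True above, these values are unused.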

# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null

train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False

adlr_autoresume: False
adlr_autoresume_interval: 1000

# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True

tp_comm_overlap_cfg: null

# data
data_path: null
split: '99,1,0'
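# split gives train/validation/test weights for a single blended data_path
# (here 99% train, 1% validation, no test); the explicit *_data_path entries
# below override it when set.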
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
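# GPTSentencePieceTokenizer requires tokenizer_model to point at a trained
# SentencePiece .model file; leaving it null here means the path must be
# supplied before launching.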
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null

# profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10

# logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null