# legacy_ppo_megatron_trainer.yaml
data:
  tokenizer: null
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 512
  max_response_length: 512
  train_batch_size: 1024
  val_batch_size: null # DEPRECATED: validation datasets are sent to the inference engines as whole batches; the engines schedule memory themselves
  return_raw_input_ids: False  # set to True when the policy and the reward model use different tokenizers
  return_raw_chat: False
  return_full_prompt: False
  shuffle: True
  filter_overlong_prompts: False # for large-scale datasets, filtering overlong prompts can be time-consuming; set filter_overlong_prompts_workers to use multiprocessing and speed it up.
  filter_overlong_prompts_workers: 1
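  # A minimal sketch (worker count is illustrative) of enabling the filter with
  # multiprocessing for a large dataset:
  # filter_overlong_prompts: True
  # filter_overlong_prompts_workers: 8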
  truncation: error
  trust_remote_code: False  # main_ppo checks this config to decide whether to trust remote code when loading the tokenizer
  custom_cls:
    path: null
    name: null
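  # A hypothetical sketch of plugging in a custom dataset class (the file and class
  # names are illustrative, not shipped with verl); the class should expose the same
  # interface as the default RLHF dataset:
  # custom_cls:
  #   path: my_project/my_dataset.py
  #   name: MyRLHFDataset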
  sampler:
    class_path: null
    class_name: null
  dataloader_num_workers: 8
  return_multi_modal_inputs: True

actor_rollout_ref:
  hybrid_engine: True
  nccl_timeout: 600 # seconds; torch's default is 10 minutes. Increase this for long-running operations, e.g. 32B or 72B models with Megatron
  model:
    path: ~/models/deepseek-llm-7b-chat
    custom_chat_template: null
    external_lib: null
    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: False
    enable_gradient_checkpointing: False
    gradient_checkpointing_kwargs:
      ## Activation Checkpointing
      activations_checkpoint_method: null # 'uniform', 'block'; not used with 'selective'
      # 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk
      # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
      activations_checkpoint_granularity: null # 'selective' or 'full'
      # 'full' will checkpoint the entire transformer layer and 'selective' only checkpoints memory intensive part of attention
      activations_checkpoint_num_layers: null # not used with 'selective'
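    # A minimal sketch of enabling activation checkpointing, following the option
    # descriptions above (values are illustrative):
    # enable_gradient_checkpointing: True
    # gradient_checkpointing_kwargs:
    #   activations_checkpoint_method: uniform
    #   activations_checkpoint_granularity: full
    #   activations_checkpoint_num_layers: 1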
    trust_remote_code: False
  actor:
    strategy: megatron  # This is for backward-compatibility
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    use_torch_compile: True # False to disable torch compile
    # pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
    clip_ratio: 0.2 # default value if clip_ratio_low and clip_ratio_high are not specified
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0 # lower bound of the value for Dual-clip PPO from https://arxiv.org/pdf/1912.09729
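    # Asymmetric "clip-higher" sketch (illustrative values, as used in DAPO-style
    # recipes to loosen the upper clip bound only):
    # clip_ratio_low: 0.2
    # clip_ratio_high: 0.28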
    loss_agg_mode: "token-mean" # options: "token-mean" / "seq-mean-token-sum" / "seq-mean-token-mean"
    # NOTE: "token-mean" is the default behavior
    entropy_coeff: 0
    use_kl_loss: False # True for GRPO
    kl_loss_coef: 0.001 # for grpo
    kl_loss_type: low_var_kl # for grpo
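    # GRPO-style sketch (illustrative): enable the in-loss KL term; typically combined
    # with algorithm.adv_estimator: grpo and rollout.n > 1.
    # use_kl_loss: True
    # kl_loss_coef: 0.001
    # kl_loss_type: low_var_kl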
    ppo_epochs: 1
    data_loader_seed: null
    shuffle: False
    policy_loss:   # policy loss config
      loss_mode: "vanilla" # Loss function mode: vanilla / clip-cov / kl-cov / gpg from https://arxiv.org/abs/2505.22617,
      clip_cov_ratio: 0.0002 # Ratio of tokens to be clipped for clip-cov loss
      clip_cov_lb: 1.0 # Lower bound for clip-cov loss
      clip_cov_ub: 5.0 # Upper bound for clip-cov loss
      kl_cov_ratio: 0.0002 # Ratio of tokens to which the KL penalty is applied for kl-cov loss
      ppo_kl_coef: 0.1 # KL divergence penalty coefficient
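    # A sketch of switching to the kl-cov loss mode (the values mirror the defaults
    # above, not a tuned recipe):
    # policy_loss:
    #   loss_mode: kl-cov
    #   kl_cov_ratio: 0.0002
    #   ppo_kl_coef: 0.1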
    optim:
      optimizer: adam
      lr: 1e-6
      clip_grad: 1.0
      total_training_steps: -1  # must be overridden by the program
      lr_warmup_init: 0.0  # initial learning rate for warmup, defaults to 0.0
      lr_warmup_steps: null # takes priority; null, 0, or negative values delegate to lr_warmup_steps_ratio
      lr_warmup_steps_ratio: 0.  # the total number of steps is injected at runtime
      lr_decay_steps: null
      lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root
      min_lr: 0.0 # minimum learning rate, defaults to 0.0
      weight_decay: 0.01
      weight_decay_incr_style: constant # select from constant/linear/cosine
      lr_wsd_decay_style: exponential # select from constant/exponential/cosine
      lr_wsd_decay_steps: null
      use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler
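      # A scheduler sketch (illustrative values): 10% linear warmup, then cosine decay
      # down to min_lr over the remaining steps.
      # lr_warmup_steps_ratio: 0.1
      # lr_decay_style: cosine
      # min_lr: 1e-7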
    megatron:
      param_offload: False
      grad_offload: False
      optimizer_offload: False
      tensor_model_parallel_size: 1
      expert_model_parallel_size: 1
      expert_tensor_parallel_size: null
      pipeline_model_parallel_size: 1
      virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
      context_parallel_size: 1
      sequence_parallel: True
      use_distributed_optimizer: True
      use_dist_checkpointing: False
      dist_checkpointing_path: null
      seed: 42
      override_transformer_config: {} # additional transformer config like: num_layers_in_first(/last)_pipeline_stage
      use_mbridge: False
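      # An example layout for a larger model (illustrative; the product of the parallel
      # sizes must divide the total number of GPUs):
      # tensor_model_parallel_size: 4
      # pipeline_model_parallel_size: 2
      # context_parallel_size: 1
      # sequence_parallel: True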
    profile: # profile the actor model in `update_policy`
      use_profile: False # enable this to profile the actor model
      profile_ranks: null # list, you can specify the ranks to profile
      step_start: -1 # start step in update_policy
      step_end: -1 # end step
      save_path: null # the path to save the profile result
    load_weight: True
    checkpoint:
      async_save: False # save checkpoint asynchronously
      # What to include in saved checkpoints
      # adding 'hf_model' saves the whole model in HF format; by default only sharded model checkpoints are saved, to save space
      save_contents: ['model', 'optimizer', 'extra']
      # For more flexibility, you can specify the contents to load from the checkpoint.
      load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
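      # A sketch of additionally exporting a whole-model HuggingFace checkpoint
      # (larger on disk), per the note above:
      # save_contents: ['model', 'hf_model', 'optimizer', 'extra']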
  ref:
    strategy: ${actor_rollout_ref.actor.strategy}
    use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
    megatron:
      param_offload: False
      tensor_model_parallel_size: 1
      expert_model_parallel_size: 1
      expert_tensor_parallel_size: null
      pipeline_model_parallel_size: 1
      virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
      context_parallel_size: 1
      sequence_parallel: True
      use_distributed_optimizer: False
      use_dist_checkpointing: False
      dist_checkpointing_path: null
      seed: ${actor_rollout_ref.actor.megatron.seed}
      override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
      use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
    profile:
      use_profile: False
      profile_ranks: null
      step_start: -1
      step_end: -1
      save_path: null
    load_weight: True
    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
  rollout:
    name: vllm
    mode: sync # sync: LLM, async: AsyncLLM
    temperature: 1.0
    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
    prompt_length: ${data.max_prompt_length}  # for xperf_gpt
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_megatron
    tensor_model_parallel_size: 1
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: False # enabling this may yield higher throughput
    # for hf rollout
    do_sample: True
    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up
    # number of responses (i.e. number of samples per prompt)
    n: 1
    engine_kwargs: # inference engine parameters
      vllm:
        swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
        disable_mm_preprocessor_cache: False # whether to disable the preprocessor cache for multimodal models.
      sglang:
        attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
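    # Illustrative engine tweaks (values are assumptions, not defaults): cap vLLM CPU
    # swap space at 16 GB and pin SGLang to the flashinfer attention backend.
    # engine_kwargs:
    #   vllm:
    #     swap_space: 16
    #   sglang:
    #     attention_backend: flashinfer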
    val_kwargs:
      # sampling parameters for validation
      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
      top_p: 1.0
      temperature: 0
      n: 1
      do_sample: False # greedy decoding by default for validation

    # Multi-turn interaction config for tools or chat.
    multi_turn:
      # Set to True for multi-turn tool interaction tasks; rollout.name must also be set to sglang
      enable: False

      # null for no limit (default max_length // 3)
      max_assistant_turns: null

      # null for no tool
      tool_config_path: null

      # null for no limit (default max_length // 3)
      max_user_turns: null

      # max parallel tool calls in a single turn
      max_parallel_calls: 1

      # max length of tool response
      max_tool_response_length: 256

      # truncate side of tool response: left, middle, right
      tool_response_truncate_side: middle

      # null for no interaction
      interaction_config_path: null

      # - When set to True, the model's default chat template is used for the multi-turn rollout, which typically matches production behavior.
      # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
      #   which may contain additional content such as reasoning. This keeps training and rollout consistent, but leads to longer prompts.
      use_inference_chat_template: False

      # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
      # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
      # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them.
      # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
      # Qwen/QwQ-32B, Qwen/Qwen3-xxB
      # - disable: disable tokenization sanity check
      # - strict: enable strict tokenization sanity check (default)
      # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
      tokenization_sanity_check_mode: strict

      # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
      format: hermes
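    # A minimal multi-turn tool-use sketch (the tool config path is a placeholder;
    # per the note above, rollout.name must be sglang for this mode):
    # multi_turn:
    #   enable: True
    #   max_user_turns: 8
    #   max_assistant_turns: 8
    #   tool_config_path: ./config/tool_config/my_tools.yaml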

    # [Experimental] agent loop based rollout configs
    agent:

      # Number of agent loop workers
      num_workers: 8

      custom_async_server:
        path: null
        name: null

    # support logging rollout prob for debugging purpose
    calculate_log_probs: False
    # Nsight system profiler configs
  profiler:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: False
    all_ranks: False
    ranks: []
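  # Illustrative profiler override (an assumption, not a default): profile only ranks
  # 0 and 1 in discrete mode, pairing with trainer.profile_steps to pick the steps:
  # profiler:
  #   _target_: verl.utils.profiler.ProfilerConfig
  #   discrete: True
  #   all_ranks: False
  #   ranks: [0, 1]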

critic:
  rollout_n: ${actor_rollout_ref.rollout.n}
  strategy: ${actor_rollout_ref.actor.strategy}
  nccl_timeout: 600 # seconds; torch's default is 10 minutes. Increase this for long-running operations, e.g. 32B or 72B models with Megatron
  optim:
    optimizer: adam
    lr: 1e-6
    clip_grad: 1.0
    total_training_steps: -1  # must be overridden by the program
    lr_warmup_init: 0.0  # initial learning rate for warmup, defaults to 0.0
    lr_warmup_steps: null # takes priority; null, 0, or negative values delegate to lr_warmup_steps_ratio
    lr_warmup_steps_ratio: 0.  # the total number of steps is injected at runtime
    lr_decay_steps: null
    lr_decay_style: linear # select from constant/linear/cosine/inverse_square_root
    min_lr: 0.0 # minimum learning rate, defaults to 0.0
    weight_decay: 0.01
    weight_decay_incr_style: constant # select from constant/linear/cosine
    lr_wsd_decay_style: exponential # select from constant/exponential/cosine
    lr_wsd_decay_steps: null
    use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: False
    external_lib: ${actor_rollout_ref.model.external_lib}
    trust_remote_code: False
    enable_gradient_checkpointing: False
    gradient_checkpointing_kwargs:
      ## Activation Checkpointing
      activations_checkpoint_method: null
      activations_checkpoint_granularity: null
      activations_checkpoint_num_layers: null
  megatron:
    param_offload: False
    grad_offload: False
    optimizer_offload: False
    tensor_model_parallel_size: 1
    expert_model_parallel_size: 1
    expert_tensor_parallel_size: null
    pipeline_model_parallel_size: 1
    virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
    context_parallel_size: 1
    sequence_parallel: True
    use_distributed_optimizer: True
    use_dist_checkpointing: False
    dist_checkpointing_path: null
    seed: ${actor_rollout_ref.actor.megatron.seed}
    override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
    use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
  load_weight: True
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: null
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  data_loader_seed: ${actor_rollout_ref.actor.data_loader_seed}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  cliprange_value: 0.5
  loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}
  checkpoint:
    async_save: False # save checkpoint asynchronously
    # What to include in saved checkpoints
    # adding 'hf_model' saves the whole model in HF format; by default only sharded model checkpoints are saved, to save space
    save_contents: ['model', 'optimizer', 'extra']
    load_contents: ${critic.checkpoint.save_contents}
  # Nsight system profiler configs
  profiler:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: False
    all_ranks: False
    ranks: []
reward_model:
  enable: False
  strategy: ${actor_rollout_ref.actor.strategy}
  nccl_timeout: 600 # seconds; torch's default is 10 minutes. Increase this for long-running operations, e.g. 32B or 72B models with Megatron
  megatron:
    param_offload: False
    tensor_model_parallel_size: 1
    expert_model_parallel_size: 1
    expert_tensor_parallel_size: null
    pipeline_model_parallel_size: 1
    virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
    context_parallel_size: 1
    sequence_parallel: True
    use_distributed_optimizer: False
    use_dist_checkpointing: False
    dist_checkpointing_path: null
    seed: ${actor_rollout_ref.actor.megatron.seed}
    override_transformer_config: {}
    use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}  # set this to null if the policy and reward model share the same chat template
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    trust_remote_code: False
    external_lib: ${actor_rollout_ref.model.external_lib}
  load_weight: True
  micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
  micro_batch_size_per_gpu: null
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  max_length: null
  reward_manager: naive
  launch_reward_fn_async: False # run the custom reward function asynchronously on CPU during log_prob computation
  sandbox_fusion:
    url: null # FaaS URL used to run code in a cloud sandbox
    max_concurrent: 64 # max concurrent requests to sandbox
    memory_limit_mb: 1024 # Max memory limit for each sandbox process in MB
  # Nsight system profiler configs
  profiler:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: False
    all_ranks: False
    ranks: []

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: True
  use_kl_in_reward: False
  kl_penalty: kl  # how to estimate kl divergence
  kl_ctrl:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
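  # A sketch of adaptive KL control (values are illustrative): the coefficient is
  # adjusted over `horizon` so that the measured KL tracks target_kl.
  # kl_ctrl:
  #   _target_: verl.trainer.config.KLControlConfig
  #   type: adaptive
  #   kl_coef: 0.001
  #   horizon: 10000
  #   target_kl: 0.1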
  use_pf_ppo: False
  pf_ppo:
    reweight_method: pow  # ["pow", "max_min", "max_random"]
    weight_pow: 2.0

trainer:
  balance_batch: True
  total_epochs: 30
  total_training_steps: null
  profile_steps: null # [1,2,5] or [] or null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: ['console', 'wandb']
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  esi_redundant_time: 0

  # auto: resume from the last checkpoint if one exists; otherwise start from scratch
  resume_mode: auto # options: auto / disable / resume_path (the latter requires resume_from_path to be set)
  resume_from_path: null
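  # Illustrative: resume from a specific checkpoint directory (the path is a
  # placeholder following the default_local_dir layout below):
  # resume_mode: resume_path
  # resume_from_path: checkpoints/verl_examples/gsm8k/global_step_100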
  del_local_ckpt_after_load: False
  val_before_train: True
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  # The timeout for ray worker group to wait for the register center to be ready
  ray_wait_register_center_timeout: 300
  device: cuda
  # see ppo_trainer.yaml for more details
  controller_nsight_options:
    trace: "cuda,nvtx,cublas,ucx"
    cuda-memory-usage: "true"
    cuda-graph-trace: "graph"
  worker_nsight_options:
    trace: "cuda,nvtx,cublas,ucx"
    cuda-memory-usage: "true"
    cuda-graph-trace: "graph"
    capture-range: "cudaProfilerApi"
    capture-range-end: null
    kill: none
  npu_profile:
    options:
      save_path: ./profiler_data
      roles: ["all"]
      level: level1
      with_memory: False
      record_shapes: False
      with_npu: True
      with_cpu: True
      with_module: False
      with_stack: False
      analysis: True

ray_init:
  num_cpus: null # null means use all CPUs, which may cause hangs when CPUs are limited (e.g. under SLURM); set an allowed number in that case
  timeline_json_file: null
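
# Illustrative override for CPU-limited schedulers (the value is an assumption): cap
# Ray at the number of CPUs actually allocated to the job.
# ray_init:
#   num_cpus: 32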