data:
  tokenizer: null
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 512
  max_response_length: 512
  train_batch_size: 1024
  # DEPRECATED: validation datasets are sent to the inference engines as one whole batch; the engines schedule memory themselves
  val_batch_size: null
  return_raw_input_ids: False # set to True when the tokenizers of the policy and the reward model differ
  return_raw_chat: False
  return_full_prompt: False
  shuffle: True
  # For large-scale datasets, filtering overlong prompts can be time-consuming.
  # You can set filter_overlong_prompts_workers to use multiprocessing to speed it up.
  filter_overlong_prompts: False
  filter_overlong_prompts_workers: 1
  truncation: error
  trust_remote_code: False # main_ppo checks this config to decide whether to use remote code for the tokenizer
  custom_cls:
    path: null
    name: null
  sampler:
    class_path: null
    class_name: null
  dataloader_num_workers: 8
  return_multi_modal_inputs: True

actor_rollout_ref:
  hybrid_engine: True
  nccl_timeout: 600 # seconds; torch defaults to 10 minutes. Increase it for long-running operations, e.g. 32B or 72B models with Megatron.
  model:
    path: ~/models/deepseek-llm-7b-chat
    custom_chat_template: null
    external_lib: null
    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: False
    enable_gradient_checkpointing: False
    gradient_checkpointing_kwargs:
      ## Activation Checkpointing
      activations_checkpoint_method: null # 'uniform', 'block'; not used with 'selective'
      # 'uniform' divides the total number of transformer layers into chunks and checkpoints the input activation of each chunk
      # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
      activations_checkpoint_granularity: null # 'selective' or 'full'
      # 'full' checkpoints the entire transformer layer; 'selective' only checkpoints the memory-intensive part of attention
      activations_checkpoint_num_layers: null # not used with 'selective'
    trust_remote_code: False
  actor:
    strategy: megatron # this is for backward compatibility
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null # will be deprecated; use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    use_torch_compile: True # False to disable torch compile
    # pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
    clip_ratio: 0.2 # default value if clip_ratio_low and clip_ratio_high are not specified
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0 # lower bound of the value for dual-clip PPO, from https://arxiv.org/pdf/1912.09729
    loss_agg_mode: "token-mean" # "token-mean" / "seq-mean-token-sum" / "seq-mean-token-mean"
    # NOTE: "token-mean" is the default behavior
    entropy_coeff: 0
    use_kl_loss: False # True for GRPO
    kl_loss_coef: 0.001 # for GRPO
    kl_loss_type: low_var_kl # for GRPO
    ppo_epochs: 1
    data_loader_seed: null
    shuffle: False
    policy_loss: # policy loss config
      loss_mode: "vanilla" # loss function mode: vanilla / clip-cov / kl-cov / gpg, from https://arxiv.org/abs/2505.22617
      clip_cov_ratio: 0.0002 # ratio of tokens to be clipped for clip-cov loss
      clip_cov_lb: 1.0 # lower bound for clip-cov loss
      clip_cov_ub: 5.0 # upper bound for clip-cov loss
      kl_cov_ratio: 0.0002 # ratio of tokens the KL penalty is applied to for kl-cov loss
      ppo_kl_coef: 0.1 # KL divergence penalty coefficient
    optim:
      optimizer: adam
      lr: 1e-6
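      # Illustrative, commented-out example (hypothetical values, not defaults): a 10%
      # warmup into cosine decay, using the scheduler keys defined just below.
      #   lr_warmup_steps_ratio: 0.1
      #   lr_decay_style: cosine
      #   min_lr: 1e-7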
      clip_grad: 1.0
      total_training_steps: -1 # must be overridden by the program
      lr_warmup_init: 0.0 # initial learning rate for warmup, defaults to 0.0
      lr_warmup_steps: null # takes priority; null, 0, or negative values delegate to lr_warmup_steps_ratio
      lr_warmup_steps_ratio: 0. # the total number of steps is injected at runtime
      lr_decay_steps: null
      lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root
      min_lr: 0.0 # minimum learning rate, defaults to 0.0
      weight_decay: 0.01
      weight_decay_incr_style: constant # select from constant/linear/cosine
      lr_wsd_decay_style: exponential # select from constant/exponential/cosine
      lr_wsd_decay_steps: null
      use_checkpoint_opt_param_scheduler: False # use the checkpointed optimizer parameter scheduler
    megatron:
      param_offload: False
      grad_offload: False
      optimizer_offload: False
      tensor_model_parallel_size: 1
      expert_model_parallel_size: 1
      expert_tensor_parallel_size: null
      pipeline_model_parallel_size: 1
      virtual_pipeline_model_parallel_size: null # change the VPP interface for parallelism tests
      context_parallel_size: 1
      sequence_parallel: True
      use_distributed_optimizer: True
      use_dist_checkpointing: False
      dist_checkpointing_path: null
      seed: 42
      override_transformer_config: {} # additional transformer config, e.g. num_layers_in_first(/last)_pipeline_stage
      use_mbridge: False
    profile: # profile the actor model in `update_policy`
      use_profile: False # set to True to profile the actor model
      profile_ranks: null # list; you can specify the ranks to profile
      step_start: -1 # start step in update_policy
      step_end: -1 # end step
      save_path: null # path to save the profile result
    load_weight: True
    checkpoint:
      async_save: False # save checkpoints asynchronously
      # What to include in saved checkpoints.
      # With 'hf_model' you can save the whole model in HF format; by default only the sharded model checkpoint is saved, to save space.
      save_contents: ['model', 'optimizer', 'extra']
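      # Illustrative, commented-out example: additionally export the model in HF format
      # at each save ('hf_model' is the option described above; checkpoints grow larger).
      #   save_contents: ['model', 'optimizer', 'extra', 'hf_model']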
      # For more flexibility, you can specify the contents to load from the checkpoint separately.
      load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
  ref:
    strategy: ${actor_rollout_ref.actor.strategy}
    use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
    megatron:
      param_offload: False
      tensor_model_parallel_size: 1
      expert_model_parallel_size: 1
      expert_tensor_parallel_size: null
      pipeline_model_parallel_size: 1
      virtual_pipeline_model_parallel_size: null # change the VPP interface for parallelism tests
      context_parallel_size: 1
      sequence_parallel: True
      use_distributed_optimizer: False
      use_dist_checkpointing: False
      dist_checkpointing_path: null
      seed: ${actor_rollout_ref.actor.megatron.seed}
      override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
      use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
    profile:
      use_profile: False
      profile_ranks: null
      step_start: -1
      step_end: -1
      save_path: null
    load_weight: True
    log_prob_micro_batch_size: null # will be deprecated; use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
  rollout:
    name: vllm
    mode: sync # sync: LLM, async: AsyncLLM
    temperature: 1.0
    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
    prompt_length: ${data.max_prompt_length} # for xperf_gpt
    response_length: ${data.max_response_length} # for vllm rollout
    dtype: bfloat16 # should align with FSDP
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_megatron
    tensor_model_parallel_size: 1
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null # will be deprecated; use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: False # enabling it may yield higher throughput
    # for hf rollout
    do_sample: True
    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up
    # number of responses (i.e. number of sample times)
    n: 1
    engine_kwargs: # inference engine parameters
      vllm:
        swap_space: null # null means "use the engine default value" (usually 4 GB); setting it to, e.g., 32 means 32 GB
        disable_mm_preprocessor_cache: False # whether to disable the preprocessor cache for multimodal models
      sglang:
        attention_backend: null # null means use the engine default value; available options: flashinfer, triton, flashmla
    val_kwargs: # sampling parameters for validation
      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
      top_p: 1.0
      temperature: 0
      n: 1
      do_sample: False # greedy decoding by default for validation
    # Multi-turn interaction config for tools or chat.
    multi_turn:
      # set to True for multi-turn tool interaction tasks; rollout.name should be set to sglang as well
      enable: False
      # null for no limit (defaults to max_length // 3)
      max_assistant_turns: null
      # null for no tool
      tool_config_path: null
      # null for no limit (defaults to max_length // 3)
      max_user_turns: null
      # max parallel calls for tools in a single turn
      max_parallel_calls: 1
      # max length of a tool response
      max_tool_response_length: 256
      # truncate side of a tool response: left, middle, right
      tool_response_truncate_side: middle
      # null for no interaction
      interaction_config_path: null
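      # Illustrative, commented-out example: enable multi-turn tool calling (requires
      # rollout.name: sglang, per the note above); the tool config path is hypothetical.
      #   enable: True
      #   tool_config_path: /path/to/tool_config.yaml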
      # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior.
      # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
      #   which may contain additional content such as reasoning content. This maintains consistency between training and rollout, but it leads to longer prompts.
      use_inference_chat_template: False
      # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
      # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
      # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them.
      # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
      # Qwen/QwQ-32B, Qwen/Qwen3-xxB
      # - disable: disable the tokenization sanity check
      # - strict: enable the strict tokenization sanity check (default)
      # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
      tokenization_sanity_check_mode: strict
      # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
      format: hermes
    # [Experimental] agent-loop-based rollout configs
    agent:
      # number of agent loop workers
      num_workers: 8
      custom_async_server:
        path: null
        name: null
    # supports logging rollout probabilities for debugging purposes
    calculate_log_probs: False
  # Nsight Systems profiler configs
  profiler:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: False
    all_ranks: False
    ranks: []
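  # Illustrative, commented-out example: profile only ranks 0 and 1 in discrete mode;
  # the rank list is hypothetical.
  #   profiler:
  #     discrete: True
  #     ranks: [0, 1]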
critic:
  rollout_n: ${actor_rollout_ref.rollout.n}
  strategy: ${actor_rollout_ref.actor.strategy}
  nccl_timeout: 600 # seconds; torch defaults to 10 minutes. Increase it for long-running operations, e.g. 32B or 72B models with Megatron.
  optim:
    optimizer: adam
    lr: 1e-6
    clip_grad: 1.0
    total_training_steps: -1 # must be overridden by the program
    lr_warmup_init: 0.0 # initial learning rate for warmup, defaults to 0.0
    lr_warmup_steps: null # takes priority; null, 0, or negative values delegate to lr_warmup_steps_ratio
    lr_warmup_steps_ratio: 0. # the total number of steps is injected at runtime
    lr_decay_steps: null
    lr_decay_style: linear # select from constant/linear/cosine/inverse_square_root
    min_lr: 0.0 # minimum learning rate, defaults to 0.0
    weight_decay: 0.01
    weight_decay_incr_style: constant # select from constant/linear/cosine
    lr_wsd_decay_style: exponential # select from constant/exponential/cosine
    lr_wsd_decay_steps: null
    use_checkpoint_opt_param_scheduler: False # use the checkpointed optimizer parameter scheduler
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: False
    external_lib: ${actor_rollout_ref.model.external_lib}
    trust_remote_code: False
    enable_gradient_checkpointing: False
    gradient_checkpointing_kwargs:
      ## Activation Checkpointing
      activations_checkpoint_method: null
      activations_checkpoint_granularity: null
      activations_checkpoint_num_layers: null
  megatron:
    param_offload: False
    grad_offload: False
    optimizer_offload: False
    tensor_model_parallel_size: 1
    expert_model_parallel_size: 1
    expert_tensor_parallel_size: null
    pipeline_model_parallel_size: 1
    virtual_pipeline_model_parallel_size: null # change the VPP interface for parallelism tests
    context_parallel_size: 1
    sequence_parallel: True
    use_distributed_optimizer: True
    use_dist_checkpointing: False
    dist_checkpointing_path: null
    seed: ${actor_rollout_ref.actor.megatron.seed}
    override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
    use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
  load_weight: True
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null # will be deprecated; use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: null
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  data_loader_seed: ${actor_rollout_ref.actor.data_loader_seed}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  cliprange_value: 0.5
  loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}
  checkpoint:
    async_save: False # save checkpoints asynchronously
    # What to include in saved checkpoints.
    # With 'hf_model' you can save the whole model in HF format; by default only the sharded model checkpoint is saved, to save space.
    save_contents: ['model', 'optimizer', 'extra']
    load_contents: ${critic.checkpoint.save_contents}
  # Nsight Systems profiler configs
  profiler:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: False
    all_ranks: False
    ranks: []

reward_model:
  enable: False
  strategy: ${actor_rollout_ref.actor.strategy}
  nccl_timeout: 600 # seconds; torch defaults to 10 minutes. Increase it for long-running operations, e.g. 32B or 72B models with Megatron.
  megatron:
    param_offload: False
    tensor_model_parallel_size: 1
    expert_model_parallel_size: 1
    expert_tensor_parallel_size: null
    pipeline_model_parallel_size: 1
    virtual_pipeline_model_parallel_size: null # change the VPP interface for parallelism tests
    context_parallel_size: 1
    sequence_parallel: True
    use_distributed_optimizer: False
    use_dist_checkpointing: False
    dist_checkpointing_path: null
    seed: ${actor_rollout_ref.actor.megatron.seed}
    override_transformer_config: {}
    use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
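  # Cross-reference: if the reward model's tokenizer differs from the policy's, keep
  # input_tokenizer set below and also set data.return_raw_input_ids: True (see the
  # data section above).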
  model:
    input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat templates are identical
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    trust_remote_code: False
    external_lib: ${actor_rollout_ref.model.external_lib}
  load_weight: True
  micro_batch_size: null # will be deprecated; use micro_batch_size_per_gpu
  micro_batch_size_per_gpu: null
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  max_length: null
  reward_manager: naive
  launch_reward_fn_async: False # run the custom reward function asynchronously on CPU during log_prob computation
  sandbox_fusion:
    url: null # FaaS URL for running code in a cloud sandbox
    max_concurrent: 64 # max concurrent requests to the sandbox
    memory_limit_mb: 1024 # max memory limit for each sandbox process, in MB
  # Nsight Systems profiler configs
  profiler:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: False
    all_ranks: False
    ranks: []

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: True
  use_kl_in_reward: False
  kl_penalty: kl # how to estimate the KL divergence
  kl_ctrl:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  use_pf_ppo: False
  pf_ppo:
    reweight_method: pow # ["pow", "max_min", "max_random"]
    weight_pow: 2.0

trainer:
  balance_batch: True
  total_epochs: 30
  total_training_steps: null
  profile_steps: null # [1,2,5] or [] or null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: ['console', 'wandb']
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  esi_redundant_time: 0
  # auto: resume from the last checkpoint if one exists; otherwise start from scratch
  resume_mode: auto # auto / disable / resume_path (requires resume_from_path to be set)
  resume_from_path: null
  del_local_ckpt_after_load: False
  val_before_train: True
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  # timeout for the Ray worker group to wait for the register center to be ready
  ray_wait_register_center_timeout: 300
  device: cuda
  # see ppo_trainer.yaml for more details
  controller_nsight_options:
    trace: "cuda,nvtx,cublas,ucx"
    cuda-memory-usage: "true"
    cuda-graph-trace: "graph"
  worker_nsight_options:
    trace: "cuda,nvtx,cublas,ucx"
    cuda-memory-usage: "true"
    cuda-graph-trace: "graph"
    capture-range: "cudaProfilerApi"
    capture-range-end: null
    kill: none

npu_profile:
  options:
    save_path: ./profiler_data
    roles: ["all"]
    level: level1
    with_memory: False
    record_shapes: False
    with_npu: True
    with_cpu: True
    with_module: False
    with_stack: False
    analysis: True

ray_init:
  num_cpus: null # null means use all CPUs, which may hang if CPUs are limited by systems like SLURM; set an explicitly allowed number in that case
  timeline_json_file: null
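# Illustrative usage (hypothetical values): fields in this file can be overridden per
# run with Hydra-style dotted paths on the command line, e.g.
#   python3 -m verl.trainer.main_ppo \
#     actor_rollout_ref.model.path=~/models/deepseek-llm-7b-chat \
#     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
#     trainer.n_gpus_per_node=8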