# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# Target class for this configuration
_target_: verl.workers.config.ActorConfig

# The abstract actor config.
# fsdp, fsdp2 or megatron. Must be set.
strategy: ???

# Split each sample into sub-batches of this size for PPO
ppo_mini_batch_size: 256

# [Deprecated] Global micro batch size
ppo_micro_batch_size: null

# Local per-GPU micro batch size
ppo_micro_batch_size_per_gpu: null

# Whether to automatically adjust batch size at runtime
# oc.select: the default value for ref.log_prob_use_dynamic_bsz
use_dynamic_bsz: false

# Max tokens per GPU in one PPO batch; affects gradient accumulation
# Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length}
# oc.select: the default value for ref.log_prob_max_token_len_per_gpu
ppo_max_token_len_per_gpu: 16384

# PPO clip ratio
clip_ratio: 0.2

# Lower bound for asymmetric clipping (used in dual-clip PPO)
clip_ratio_low: 0.2

# Upper bound for asymmetric clipping (used in dual-clip PPO)
clip_ratio_high: 0.2

# policy loss config
policy_loss:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.PolicyLossConfig

  # Loss function mode: vanilla / clip-cov / kl-cov / gpg, from https://arxiv.org/abs/2505.22617
  loss_mode: "vanilla"

  # Ratio of tokens to be clipped for clip-cov loss
  clip_cov_ratio: 0.0002

  # Lower bound for clip-cov loss
  clip_cov_lb: 1.0

  # Upper bound for clip-cov loss
  clip_cov_ub: 5.0

  # Ratio of tokens the KL penalty is applied to for kl-cov loss
  kl_cov_ratio: 0.0002

  # KL divergence penalty coefficient
  ppo_kl_coef: 0.1

# Constant C in dual-clip PPO; clips when advantage < 0 and ratio > C
clip_ratio_c: 3.0

# Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
loss_agg_mode: token-mean

# Entropy regularization coefficient in PPO loss
entropy_coeff: 0

# Whether to use KL loss instead of the KL reward penalty. True for GRPO
use_kl_loss: false

# Whether to use torch.compile()
# oc.select: the default value for ref.use_torch_compile
use_torch_compile: true

# KL loss coefficient when use_kl_loss is enabled. For GRPO
kl_loss_coef: 0.001

# Type of KL divergence loss. Options: "kl" (k1), "abs", "mse" (k2), "low_var_kl" (k3), "full"
kl_loss_type: low_var_kl

# Number of PPO epochs per batch
ppo_epochs: 1

# Shuffle training data across PPO epochs
shuffle: false

# checkpoint configs
checkpoint:

  # Target dataclass for this configuration
  _target_: verl.trainer.config.CheckpointConfig

  # What to include in saved checkpoints.
  # With 'hf_model' you can save the whole model in HF format; by default only the
  # sharded model checkpoint is saved to save space.
  save_contents: ['model', 'optimizer', 'extra']

  # For more flexibility, you can specify the contents to load from the checkpoint.
  # .xxx refers to the sibling field xxx at the same level of the hierarchy,
  # similar to a relative import in a Python package (see the sketch below).
  load_contents: ${.save_contents}

  # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
  async_save: False
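# Illustration only, not part of the schema: a minimal sketch of how the
# relative interpolation ${.save_contents} above resolves, assuming only the
# OmegaConf API; the toy config below is made up for the example.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.create(
#       {"checkpoint": {"save_contents": ["model", "optimizer", "extra"],
#                       "load_contents": "${.save_contents}"}}
#   )
#   # The leading dot resolves against the sibling field at the same level:
#   assert cfg.checkpoint.load_contents == ["model", "optimizer", "extra"]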
# optimizer configs
optim:

  # Learning rate
  lr: 1e-6

  # Warmup steps ratio (used when lr_warmup_steps is None, 0, or negative)
  lr_warmup_steps_ratio: 0.0

  # Total training steps (must be overridden at runtime)
  total_training_steps: -1

  # Weight decay
  weight_decay: 0.01

  # Takes priority over lr_warmup_steps_ratio; None, 0, or negative values
  # delegate to lr_warmup_steps_ratio.
  lr_warmup_steps: -1

# Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
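# Illustration only, not part of the schema: a minimal sketch of how the
# oc.select resolver above behaves, assuming only the OmegaConf API. It selects
# actor_rollout_ref.model.use_fused_kernels when that key exists and falls back
# to the stated default (false) otherwise; the standalone config here is made up.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.create(
#       {"use_fused_kernels": "${oc.select:actor_rollout_ref.model.use_fused_kernels,false}"}
#   )
#   # The referenced key is absent in this toy config, so the default applies:
#   assert cfg.use_fused_kernels is False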