hydra:
  searchpath:
    - file://verl/trainer/config

defaults:
  - ppo_trainer
  - _self_

data:
  gen_batch_size: ${data.train_batch_size}

reward_model:
  reward_kwargs:
    overlong_buffer_cfg: ${reward_model.overlong_buffer}
  reward_manager: dapo
  overlong_buffer:
    enable: False # enable DAPO's soft overlong-length penalty
    len: 0 # buffer length in tokens over which the penalty ramps up
    penalty_factor: 0.0 # penalty scale; the maximum penalty applied at the length limit
    log: False

algorithm:
  filter_groups:
    enable: False # set explicitly so dynamic-sampling filtering is not forgotten
    metric: null # acc / score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 0 # non-positive values mean no upper limit

trainer:
  project_name: verl-entropy

actor_rollout_ref:
  actor:
    policy_loss:
      loss_mode: "vanilla" # vanilla / clip-cov / kl-cov from https://arxiv.org/abs/2505.
      clip_cov_ratio: 0.0002 # for clip-cov loss
      clip_cov_lb: 1.0 # for clip-cov loss
      clip_cov_ub: 5.0 # for clip-cov loss
      kl_cov_ratio: 0.0002 # for kl-cov loss
      ppo_kl_coef: 0.1 # for kl-cov loss
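
# --- Usage sketch (illustrative, not part of the recipe) ---
# With overlong_buffer enabled, a response is penalized once it enters the
# last `len` tokens before the maximum response length: the penalty grows
# linearly across the buffer and reaches -penalty_factor at the limit
# (DAPO's soft overlong punishment).
#
# A hypothetical Hydra override invocation; the entry-point module name below
# is an assumption, so substitute your recipe's actual main script. The dotted
# keys are the ones defined in this file:
#
#   python -m recipe.dapo.main_dapo \
#     reward_model.overlong_buffer.enable=True \
#     reward_model.overlong_buffer.len=4096 \
#     reward_model.overlong_buffer.penalty_factor=1.0 \
#     algorithm.filter_groups.enable=True \
#     algorithm.filter_groups.metric=acc \
#     algorithm.filter_groups.max_num_gen_batches=10 \
#     actor_rollout_ref.actor.policy_loss.loss_mode=kl-cov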