# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
_target_: verl.workers.config.CriticConfig

# Number of rollouts per update (mirrors actor rollout_n)
rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}

# fsdp or fsdp2 strategy used for critic model training; ??? marks a mandatory value that must be provided at runtime
strategy: ???

# Whether to enable the critic worker.
# By default it is only enabled if the advantage estimator is GAE.
# Set it to True manually if you always want the critic worker enabled.
enable: null

# optimizer configs
optim:

  # Learning rate
  lr: 1e-5

  # Warmup steps ratio; total steps will be injected at runtime
  lr_warmup_steps_ratio: 0.0

  # Total training steps (must be overridden at runtime)
  total_training_steps: -1

  # Weight decay
  weight_decay: 0.01

  # Number of warmup steps; takes priority over lr_warmup_steps_ratio.
  # None, 0, or negative values delegate to lr_warmup_steps_ratio.
  lr_warmup_steps: -1

# model config for the critic
model:

  # Path to pretrained model weights
  path: ~/models/deepseek-llm-7b-chat

  # Tokenizer path (defaults to actor's model path)
  tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}

  # Hugging Face config override
  override_config: {}

  # External model implementation (optional)
  external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}

  # Whether to trust remote code from Hugging Face models
  trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}

# PPO mini-batch size per update
ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}

# [Deprecated] Global micro batch size
ppo_micro_batch_size: null

# Local per-GPU micro batch size
ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}

# Whether to automatically adjust batch size at runtime
use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}

# Max tokens per GPU in one PPO batch (doubled for critic)
ppo_max_token_len_per_gpu: 32768

# Max token length per GPU in the forward pass
forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}

# Number of PPO epochs per batch
ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}

# Shuffle training data across PPO epochs
shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}

# PPO value function clipping range
cliprange_value: 0.5

# Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}

# checkpoint configs
checkpoint:

  # Target dataclass for this configuration
  _target_: verl.trainer.config.CheckpointConfig

  # What to include in saved checkpoints.
  # Add 'hf_model' to also save the whole model in Hugging Face format;
  # by default only the sharded model checkpoint is saved, to save space.
  save_contents: ['model', 'optimizer', 'extra']

  # What to include when loading checkpoints
  load_contents: ${.save_contents}

  # Whether to save checkpoints asynchronously. Currently only effective for Megatron.
  async_save: False

# profiler configs
# the corresponding dataclass is verl.utils.profiler.ProfilerConfig
profiler:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.utils.profiler.ProfilerConfig

  # True: each profiled task gets its own database. False: all tasks in one training step share one database.
  discrete: False

  # Whether to profile all ranks
  all_ranks: False

  # The ranks that will be profiled: [] or [0,1,...]
  ranks: []
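
# ---------------------------------------------------------------------------
# Illustrative only: a minimal sketch of how this file can be turned into the
# CriticConfig dataclass named in _target_ at the top, assuming it is loaded
# standalone with plain OmegaConf (the file path and call site here are
# assumptions, not part of this config). Note that the ${oc.select:...}
# interpolations fall back to their defaults when the actor_rollout_ref keys
# are absent, so a standalone load still resolves.
#
#   from omegaconf import OmegaConf
#   from verl.utils import omega_conf_to_dataclass
#
#   cfg = OmegaConf.load("critic.yaml")        # hypothetical path to this file
#   critic_cfg = omega_conf_to_dataclass(cfg)  # reads _target_ -> CriticConfig
# ---------------------------------------------------------------------------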
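
# ---------------------------------------------------------------------------
# Illustrative only: strategy is mandatory (???) and must be supplied at
# launch. A hedged example of Hydra-style command-line overrides, assuming
# this file is composed under the `critic` config group of the trainer config:
#
#   python3 -m verl.trainer.main_ppo \
#     critic.strategy=fsdp \
#     critic.optim.lr=1e-5 \
#     critic.ppo_micro_batch_size_per_gpu=4
# ---------------------------------------------------------------------------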