# Trainer configuration composed from per-component default configs.
#
# Specify the default per-component configs.
defaults:
  # Entry syntax: <config_group>@<target_package>: <config_file_name>

  # actor_rollout_ref.actor: trainer/config/actor/megatron_actor.yaml
  - actor@actor_rollout_ref.actor: megatron_actor

  # trainer.npu_profile: trainer/config/npu_profile/npu_profile.yaml
  - npu_profile@trainer.npu_profile: npu_profile

  # data: trainer/config/data/legacy_data.yaml
  - data@data: legacy_data

  # Load the reference default config, then apply the fields in the
  # current yaml.
  # Reference model config.
  # The reference model is enabled when actor.use_kl_loss and/or
  # algorithm.use_kl_in_reward is True.
  - ref@actor_rollout_ref.ref: megatron_ref

  # Rollout model config.
  - rollout@actor_rollout_ref.rollout: rollout

  # Critic model config.
  - critic@critic: megatron_critic

  # Reward model config.
  - reward_model@reward_model: megatron_reward_model

  # Listed last so that the fields in this file are applied on top of the
  # component defaults above.
  - _self_

actor_rollout_ref:
  hybrid_engine: True

  # Timeout in seconds. The torch default is 10 minutes; set a larger value
  # for long-running operations such as 32B or 72B models using megatron.
  nccl_timeout: 600

  model:
    path: ~/models/deepseek-llm-7b-chat
    custom_chat_template: null
    external_lib: null

    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: False

    # Whether to use custom fused kernels (PostProcessing, for memory
    # efficiency).
    use_fused_kernels: False

    trust_remote_code: False

  rollout:
    # May give higher throughput when set to True. When activated, please
    # increase max_num_batched_tokens or decrease max_model_len.
    enable_chunked_prefill: False

    load_format: dummy_megatron

    tensor_model_parallel_size: 1

    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up

  profiler:
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: False
    all_ranks: False
    ranks: []

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  # Required when using verl.utils.omega_conf_to_dataclass to instantiate
  # dataclass configs.
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: True
  use_kl_in_reward: False
  kl_penalty: kl  # how to estimate kl divergence

  kl_ctrl:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate
    # dataclass configs.
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1

  use_pf_ppo: False
  pf_ppo:
    reweight_method: pow  # ["pow", "max_min", "max_random"]
    weight_pow: 2.0

trainer:
  balance_batch: True
  total_epochs: 30
  total_training_steps: null
  profile_steps: null  # [1,2,5] or [] or null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: ['console', 'wandb']
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  esi_redundant_time: 0

  # auto: find the last checkpoint to resume from. If none can be found,
  # start from scratch.
  resume_mode: auto  # or disable or resume_path if resume_from_path is set
  resume_from_path: null
  del_local_ckpt_after_load: False
  val_before_train: True
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null

  # The timeout for ray worker group to wait for the register center to be
  # ready.
  ray_wait_register_center_timeout: 300
  device: cuda

  # See ppo_trainer.yaml for more details.
  controller_nsight_options:
    trace: "cuda,nvtx,cublas,ucx"
    cuda-memory-usage: "true"
    cuda-graph-trace: "graph"

  worker_nsight_options:
    trace: "cuda,nvtx,cublas,ucx"
    cuda-memory-usage: "true"
    cuda-graph-trace: "graph"
    capture-range: "cudaProfilerApi"
    capture-range-end: null
    kill: none

ray_init:
  # `None` means using all CPUs, which might cause a hang when CPUs are
  # limited by systems like SLURM. In that case set this to an allowed number.
  num_cpus: null
  timeline_json_file: null