# This reference configuration yaml is automatically generated via 'scripts/generate_trainer_config.sh',
# which invokes 'python3 scripts/print_cfg.py --cfg job' to flatten the 'verl/trainer/config/ppo_trainer.yaml'
# config fields into a single file.
# Do not modify this file directly.
# The file is for reference only and is never used directly.
actor_rollout_ref:
  actor:
    _target_: verl.workers.config.FSDPActorConfig
    strategy: fsdp
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    policy_loss:
      _target_: verl.workers.config.PolicyLossConfig
      loss_mode: vanilla
      clip_cov_ratio: 0.0002
      clip_cov_lb: 1.0
      clip_cov_ub: 5.0
      kl_cov_ratio: 0.0002
      ppo_kl_coef: 0.1
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0
    use_kl_loss: false
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    checkpoint:
      _target_: verl.trainer.config.CheckpointConfig
      save_contents:
      - model
      - optimizer
      - extra
      load_contents: ${.save_contents}
      async_save: false
    optim:
      lr: 1.0e-06
      lr_warmup_steps_ratio: 0.0
      total_training_steps: -1
      weight_decay: 0.01
      lr_warmup_steps: -1
      _target_: verl.workers.config.FSDPOptimizerConfig
      min_lr_ratio: 0.0
      num_cycles: 0.5
      warmup_style: constant
    use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
    grad_clip: 1.0
    ulysses_sequence_parallel_size: 1
    entropy_from_logits_with_chunking: false
    entropy_checkpointing: false
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      offload_policy: false
      reshard_after_forward: true
      fsdp_size: -1
      forward_prefetch: false
    use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
  ref:
    strategy: ${actor_rollout_ref.actor.strategy}
    use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true}
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
    log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      wrap_policy:
        min_num_params: 0
      param_offload: false
      reshard_after_forward: true
      forward_prefetch: false
    ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
    entropy_from_logits_with_chunking: false
    entropy_checkpointing: false
  rollout:
    name: ???
    mode: sync
    temperature: 1.0
    top_k: -1
    top_p: 1
    prompt_length: ${oc.select:data.max_prompt_length,512}
    response_length: ${oc.select:data.max_response_length,512}
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    ignore_eos: false
    enforce_eager: true
    free_cache_engine: true
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
    log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}
    disable_log_stats: true
    do_sample: true
    'n': 1
    multi_stage_wake_up: false
    engine_kwargs:
      vllm:
        swap_space: null
        disable_mm_preprocessor_cache: false
      sglang:
        attention_backend: null
    val_kwargs:
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
    multi_turn:
      enable: false
      max_assistant_turns: null
      tool_config_path: null
      max_user_turns: null
      max_parallel_calls: 1
      max_tool_response_length: 256
      tool_response_truncate_side: middle
      interaction_config_path: null
      use_inference_chat_template: false
      tokenization_sanity_check_mode: strict
      format: hermes
    calculate_log_probs: false
    agent:
      num_workers: 8
      agent_loop_config_path: null
      custom_async_server:
        path: null
        name: null
    update_weights_bucket_megabytes: 512
    trace:
      backend: null
      token2text: false
    enable_chunked_prefill: true
    load_format: dummy_dtensor
    layered_summon: false
  hybrid_engine: true
  model:
    path: ~/models/deepseek-llm-7b-chat
    custom_chat_template: null
    use_shm: false
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    enable_activation_offload: false
    use_remove_padding: false
    lora_rank: 0
    lora_alpha: 16
    target_modules: all-linear
    exclude_modules: null
    use_liger: false
    use_fused_kernels: false
    fused_kernel_options:
      impl_backend: torch
    trust_remote_code: false
  profiler:
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: false
    all_ranks: false
    ranks: []
trainer:
  npu_profile:
    options:
      save_path: ./profiler_data
      roles:
      - all
      level: level1
      with_memory: false
      record_shapes: false
      with_npu: true
      with_cpu: true
      with_module: false
      with_stack: false
      analysis: true
  balance_batch: true
  total_epochs: 30
  total_training_steps: null
  profile_steps: null
  controller_nsight_options:
    trace: cuda,nvtx,cublas,ucx
    cuda-memory-usage: 'true'
    cuda-graph-trace: graph
  worker_nsight_options:
    trace: cuda,nvtx,cublas,ucx
    cuda-memory-usage: 'true'
    cuda-graph-trace: graph
    capture-range: cudaProfilerApi
    capture-range-end: null
    kill: none
  project_name: verl_examples
  experiment_name: gsm8k
  logger:
  - console
  - wandb
  log_val_generations: 0
  rollout_data_dir: null
  validation_data_dir: null
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  esi_redundant_time: 0
  resume_mode: auto
  resume_from_path: null
  val_before_train: true
  val_only: false
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  ray_wait_register_center_timeout: 300
  device: cuda
  use_legacy_worker_impl: auto
data:
  tokenizer: null
  use_shm: false
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 512
  max_response_length: 512
  train_batch_size: 1024
  val_batch_size: null
  return_raw_input_ids: false
  return_raw_chat: false
  return_full_prompt: false
  shuffle: true
  dataloader_num_workers: 8
  validation_shuffle: false
  filter_overlong_prompts: false
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  video_key: videos
  trust_remote_code: false
  custom_cls:
    path: null
    name: null
  return_multi_modal_inputs: true
  sampler:
    class_path: null
    class_name: null
  datagen:
    path: null
    name: null
critic:
  _target_: verl.workers.config.FSDPCriticConfig
  rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1}
  strategy: fsdp
  enable: null
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    total_training_steps: -1
    weight_decay: 0.01
    lr_warmup_steps: -1
    _target_: verl.workers.config.FSDPOptimizerConfig
    min_lr_ratio: null
    warmup_style: constant
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}
    override_config: {}
    external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null}
    trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false}
    _target_: verl.workers.config.FSDPCriticModelCfg
    use_shm: false
    enable_gradient_checkpointing: true
    enable_activation_offload: false
    use_remove_padding: false
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      param_offload: false
      optimizer_offload: false
      offload_policy: false
      reshard_after_forward: true
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
      forward_prefetch: false
    lora_rank: 0
    lora_alpha: 16
    target_modules: all-linear
  ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null}
  use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}
  ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1}
  shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false}
  cliprange_value: 0.5
  loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}
  checkpoint:
    _target_: verl.trainer.config.CheckpointConfig
    save_contents:
    - model
    - optimizer
    - extra
    load_contents: ${.save_contents}
    async_save: false
  profiler:
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: false
    all_ranks: false
    ranks: []
  forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null}
  forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
  ulysses_sequence_parallel_size: 1
  grad_clip: 1.0
reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    trust_remote_code: false
    use_shm: false
    use_remove_padding: false
    use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
    fsdp_config:
      _target_: verl.workers.config.FSDPEngineConfig
      wrap_policy:
        min_num_params: 0
      param_offload: false
      reshard_after_forward: true
      fsdp_size: -1
      forward_prefetch: false
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  reward_manager: naive
  launch_reward_fn_async: false
  sandbox_fusion:
    url: null
    max_concurrent: 64
    memory_limit_mb: 1024
  profiler:
    _target_: verl.utils.profiler.ProfilerConfig
    discrete: false
    all_ranks: false
    ranks: []
  ulysses_sequence_parallel_size: 1
custom_reward_function:
  path: null
  name: compute_score
algorithm:
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: true
  use_kl_in_reward: false
  kl_penalty: kl
  kl_ctrl:
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  use_pf_ppo: false
  pf_ppo:
    reweight_method: pow
    weight_pow: 2.0
ray_init:
  num_cpus: null
  timeline_json_file: null
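
# Reading notes:
# - '${a.b.c}' is OmegaConf interpolation and copies the referenced field verbatim, while
#   '${oc.select:a.b.c,default}' resolves to that field when it is present and falls back
#   to the given default otherwise.
# - A minimal usage sketch, assuming the standard Hydra-style entry point
#   'verl.trainer.main_ppo' (adjust the module and paths to your setup). Any field in this
#   file can be overridden on the command line via its dotted key path, e.g.:
#     python3 -m verl.trainer.main_ppo \
#       data.train_files=~/data/rlhf/gsm8k/train.parquet \
#       data.val_files=~/data/rlhf/gsm8k/test.parquet \
#       actor_rollout_ref.model.path=~/models/deepseek-llm-7b-chat \
#       actor_rollout_ref.rollout.name=vllm \
#       trainer.n_gpus_per_node=8 trainer.nnodes=1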