data:
  tokenizer: null
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 512
  max_response_length: 512
  gen_batch_size: ${data.train_batch_size}
  train_batch_size: 1024
  val_batch_size: null # DEPRECATED: validation datasets are sent to the inference engines as a whole batch, and the engines schedule memory themselves
  return_raw_input_ids: False # set to True when the policy and reward-model tokenizers differ
  return_raw_chat: False
  shuffle: True
  filter_overlong_prompts: False # for large-scale datasets, filtering overlong prompts can be time-consuming; keep this disabled and set `truncation='left'` instead
  truncation: error
  image_key: images
  video_key: videos
  custom_cls:
    path: null
    name: null

actor_rollout_ref:
  hybrid_engine: True
  model:
    path: ~/models/deepseek-llm-7b-chat
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
  actor:
    strategy: fsdp # kept for backward compatibility
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null # will be deprecated; use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384 # n * (${data.max_prompt_length} + ${data.max_response_length})
    grad_clip: 1.0
    # pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
    clip_ratio: 0.2 # default value if clip_ratio_low and clip_ratio_high are not specified
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0 # lower bound of the value for dual-clip PPO, from https://arxiv.org/pdf/1912.09729
    loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
    # NOTE: "token-mean" is the default behavior
    entropy_coeff: 0.001
    use_kl_loss: False # True for GRPO
    use_torch_compile: True # False to disable torch compile
    kl_loss_coef: 0.001 # for GRPO
    kl_loss_type: low_var_kl # for GRPO
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1 # sp size
    checkpoint:
      contents: ['model', 'hf_model', 'optimizer', 'extra'] # with 'hf_model' the whole model can be saved in HF format; keep only the sharded model checkpoint to save space
    optim:
      lr: 1e-6
      lr_warmup_steps: -1 # prioritized; negative values mean delegating to lr_warmup_steps_ratio
      lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
      min_lr_ratio: null # only useful for warmup with cosine
      warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be overridden by the program
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
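
# Hedged sketch, not part of the defaults: several comments above flag keys as
# GRPO-related ("True for GRPO", "> 1 for GRPO", etc.). Assuming the trainer
# accepts `adv_estimator: grpo` (see the `algorithm` section below), a
# GRPO-style run would flip these keys together; the values are illustrative:
#
#   actor_rollout_ref:
#     actor:
#       use_kl_loss: True        # "True for GRPO"
#       kl_loss_coef: 0.001      # "for GRPO"
#       kl_loss_type: low_var_kl
#     rollout:
#       n: 5                     # "> 1 for GRPO"; any value > 1 works
#   algorithm:
#     adv_estimator: grpo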
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    log_prob_micro_batch_size: null # will be deprecated; use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
    use_fire_sampling: False # https://arxiv.org/abs/2410.21236
    prompt_length: ${data.max_prompt_length} # not used by the open-source rollout
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null # will be deprecated; use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True # may yield higher throughput; when enabled, increase max_num_batched_tokens or decrease max_model_len
    # for hf rollout
    do_sample: True
    # number of responses (i.e. number of sample times)
    n: 1 # > 1 for GRPO
    val_kwargs:
      # sampling parameters for validation
      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
      top_p: 1.0
      temperature: 0
      n: 1
      do_sample: False # greedy decoding by default for validation

critic:
  rollout_n: ${actor_rollout_ref.rollout.n}
  strategy: fsdp
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
    min_lr_ratio: null # only useful for warmup with cosine
    warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program
    weight_decay: 0.01
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      optimizer_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null # will be deprecated; use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1 # sp size
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents: ['model', 'hf_model', 'optimizer', 'extra'] # with 'hf_model' the whole model can be saved in HF format; keep only the sharded model checkpoint to save space
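
# Hedged sizing sketch for dynamic batching, using only values defined above:
# with max_prompt_length=512 and max_response_length=512, one sequence takes
# at most 1024 tokens, so the actor budget of 16384 packs up to
# 16384 / 1024 = 16 sequences per GPU per pass, and the critic budget of
# 32768 (2x the actor's, per its comment) packs up to 32. To enable it:
#
#   actor_rollout_ref:
#     actor:
#       use_dynamic_bsz: True
#       ppo_max_token_len_per_gpu: 16384 # n * (max_prompt_length + max_response_length)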
reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat templates are identical
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: False
      fsdp_size: -1
  micro_batch_size: null # will be deprecated; use micro_batch_size_per_gpu
  micro_batch_size_per_gpu: null # set a number
  max_length: null
  ulysses_sequence_parallel_size: 1 # sp size
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  reward_manager: naive
  overlong_buffer:
    enable: False # explicit flag, to avoid forgetting to enable it
    len: 0
    penalty_factor: 0.0
    log: False

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  use_kl_in_reward: False
  kl_penalty: kl # how to estimate the KL divergence
  kl_ctrl:
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  filter_groups:
    enable: False # explicit flag, to avoid forgetting to enable it
    metric: null # acc / score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 0 # non-positive values mean no upper limit

trainer:
  balance_batch: True
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: [ 'console', 'wandb' ]
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  # auto: resume from the last checkpoint if one exists; otherwise start from scratch
  resume_mode: auto # or disable, or resume_path if resume_from_path is set
  resume_from_path: null
  val_before_train: True
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
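
# Minimal launch sketch. Assuming this file is consumed by a Hydra/OmegaConf
# entry point such as verl's `verl.trainer.main_ppo` (the ${...} interpolations
# and the `verl_examples` project name point that way), any key above can be
# overridden from the command line, e.g.:
#
#   python3 -m verl.trainer.main_ppo \
#       data.train_files=$HOME/data/rlhf/gsm8k/train.parquet \
#       data.val_files=$HOME/data/rlhf/gsm8k/test.parquet \
#       actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
#       trainer.n_gpus_per_node=8 trainer.nnodes=1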