# -*- coding: utf-8 -*-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NOW=$(date +%Y%m%d)
export WANDB_DIR=gsm8k-grpo-lora-qwen2.5-14b-${NOW}
export WANDB_PROJECT=${WANDB_DIR}
export WANDB_EXP=14b-${NOW}
MODEL_PATH=Qwen/Qwen2.5-14B-Instruct
set -x
nproc_per_gpu=58 # 32√ → 64× → 48√ → 56√ → 60× → 58√ → 59×
nnodes=1
ngpu_per_node=2
total_procs=$(( nproc_per_gpu * nnodes * ngpu_per_node ))
mini_batch_size=$(( total_procs ))
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=${total_procs} \
data.val_batch_size=${total_procs} \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.shuffle=False \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.ppo_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.25 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=512 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
actor_rollout_ref.actor.entropy_coeff=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name=${WANDB_PROJECT} \
trainer.experiment_name=${WANDB_EXP} \
trainer.n_gpus_per_node=2 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@ 2>&1 | tee ${WANDB_PROJECT}.log
set -x
gsm8k_train_path=$HOME/data/rlhf/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/rlhf/math/test.parquet
model_path=Qwen/Qwen2.5-Coder-14B-Instruct
train_files="['$gsm8k_train_path']"
test_files="['$gsm8k_test_path']"
PYTHONPATH=/opt/tiger/open_verl python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=$model_path \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='qwen2_14b_function_rm' \
trainer.n_gpus_per_node=4 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@
# -*- coding: utf-8 -*-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NOW=$(date +%Y%m%d)
export WANDB_DIR=gsm8k-grpo-lora-qwen2.5-32b-${NOW}
export WANDB_PROJECT=${WANDB_DIR}
export WANDB_EXP=32b-${NOW}
MODEL_PATH=Qwen/Qwen2.5-32B-Instruct
set -x
nproc_per_gpu=45 # 32√ → 64× → 48× → 40√ → 44√ → 46× → 45×
nnodes=1
ngpu_per_node=4
total_procs=$(( nproc_per_gpu * nnodes * ngpu_per_node ))
mini_batch_size=$(( total_procs ))
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=${total_procs} \
data.val_batch_size=${total_procs} \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.shuffle=False \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.ppo_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=512 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
actor_rollout_ref.actor.entropy_coeff=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name=${WANDB_PROJECT} \
trainer.experiment_name=${WANDB_EXP} \
trainer.n_gpus_per_node=4 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@ 2>&1 | tee ${WANDB_PROJECT}.log
set -x
# we need this to avoid fragmentation of GPU memory
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256
gsm8k_train_path=$HOME/data/rlhf/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/rlhf/math/test.parquet
train_files="['$gsm8k_train_path']"
test_files="['$gsm8k_test_path']"
model_path=Qwen/Qwen2.5-32B
python3 -m verl.trainer.main_ppo --config-path=config \
--config-name='ppo_megatron_trainer.yaml'\
algorithm.adv_estimator=grpo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=512 \
data.max_prompt_length=2048 \
data.max_response_length=6144 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=$model_path \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=8 \
actor_rollout_ref.actor.megatron.param_offload=True \
actor_rollout_ref.actor.megatron.grad_offload=True \
actor_rollout_ref.actor.megatron.optimizer_offload=True \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.ref.megatron.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=console \
trainer.project_name='megatron_vllm_qwen2_32b' \
trainer.experiment_name='qwen2_32b_grpo_8_h20' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@
# -*- coding: utf-8 -*-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NOW=$(date +%Y%m%d)
export WANDB_DIR=gsm8k-grpo-lora-qwen2.5-3b-${NOW}
export WANDB_PROJECT=${WANDB_DIR}
export WANDB_EXP=3b-${NOW}
MODEL_PATH=Qwen/Qwen2.5-3B-Instruct
set -x
nproc_per_gpu=62
nnodes=1
ngpu_per_node=1
total_procs=$(( nproc_per_gpu * nnodes * ngpu_per_node ))
mini_batch_size=$(( total_procs ))
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=${total_procs} \
data.val_batch_size=${total_procs} \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.shuffle=False \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.ppo_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.1 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=512 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
actor_rollout_ref.actor.entropy_coeff=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name=${WANDB_PROJECT} \
trainer.experiment_name=${WANDB_EXP} \
trainer.n_gpus_per_node=1 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@ 2>&1 | tee ${WANDB_PROJECT}.log
set -x
gsm8k_train_path=$HOME/data/rlhf/gsm8k/train.parquet
gsm8k_val_path=$HOME/data/rlhf/math/test.parquet
model_path=Qwen/Qwen2-72B-Instruct
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$gsm8k_train_path \
data.val_files=$gsm8k_val_path \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=$model_path \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=16 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='Qwen2_72B_Instruct' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=4 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@
set -x
#### important: vllm version must be >= 0.8.3
gsm8k_train_path=$HOME/data/rlhf/gsm8k/train.parquet
gsm8k_val_path=$HOME/data/rlhf/math/test.parquet
model_path=Qwen/Qwen2-72B-Instruct
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$gsm8k_train_path \
data.val_files=$gsm8k_val_path \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=$model_path \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=16 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='Qwen2_72B_Instruct' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=4 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@
# -*- coding: utf-8 -*-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NOW=$(date +%Y%m%d)
export WANDB_DIR=gsm8k-grpo-lora-qwen2.5-72b-${NOW}
export WANDB_PROJECT=${WANDB_DIR}
export WANDB_EXP=72b-${NOW}
MODEL_PATH=Qwen/Qwen2.5-72B-Instruct
set -x
nproc_per_gpu=22 # 16√ → 32× → 24× → 20√ → 22√ → 23×
nnodes=1
ngpu_per_node=8
total_procs=$(( nproc_per_gpu * nnodes * ngpu_per_node ))
mini_batch_size=$(( total_procs ))
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=${total_procs} \
data.val_batch_size=${total_procs} \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.shuffle=False \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.ppo_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=512 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
actor_rollout_ref.actor.entropy_coeff=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name=${WANDB_PROJECT} \
trainer.experiment_name=${WANDB_EXP} \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@ 2>&1 | tee ${WANDB_PROJECT}.log
# -*- coding: utf-8 -*-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NOW=$(date +%Y%m%d)
export WANDB_DIR=gsm8k-grpo-lora-qwen2.5-7b-${NOW}
export WANDB_PROJECT=${WANDB_DIR}
export WANDB_EXP=7b-${NOW}
MODEL_PATH=Qwen/Qwen2.5-7B-Instruct
set -x
nproc_per_gpu=16 # 64√ → 128× → 96√ → 112× → 104× → 100√ → 102× → 101×
nnodes=1
ngpu_per_node=1
total_procs=$(( nproc_per_gpu * nnodes * ngpu_per_node ))
mini_batch_size=$(( total_procs ))
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=${total_procs} \
data.val_batch_size=${total_procs} \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.shuffle=False \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.ppo_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.2 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=512 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
actor_rollout_ref.actor.entropy_coeff=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name=${WANDB_PROJECT} \
trainer.experiment_name=${WANDB_EXP} \
trainer.n_gpus_per_node=1 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@ 2>&1 | tee ${WANDB_PROJECT}.log
set -x
gsm8k_train_path=$HOME/data/rlhf/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/rlhf/math/test.parquet
model_path=Qwen/Qwen2-7B-Instruct
train_files="['$gsm8k_train_path']"
test_files="['$gsm8k_test_path']"
PYTHONPATH=/opt/tiger/open_verl python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=$model_path \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='qwen2_7b_function_rm' \
trainer.n_gpus_per_node=2 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@
# -------------------------------
# build-system
# -------------------------------
[build-system]
requires = [
    "setuptools>=61.0",
    "wheel",
]
build-backend = "setuptools.build_meta"
# -------------------------------
# project (PEP 621 metadata)
# -------------------------------
[project]
name = "verl"
# We'll mark the version as "dynamic" because it's read from the file "verl/version/version"
# (PEP 621 calls this "dynamic version").
# The actual version is specified in the [tool.setuptools.dynamic] section below.
dynamic = ["version", "dependencies", "optional-dependencies", "authors", "urls"]
description = "verl: Volcano Engine Reinforcement Learning for LLM"
license = {text = "Apache-2.0"} # Changed from file to text format
readme = {file = "README.md", content-type = "text/markdown"}
requires-python = ">=3.10"
# -------------------------------
# tool.ruff - Linting configuration
# -------------------------------
[tool.ruff]
# Note: While the formatter will attempt to format lines such that they remain within the line-length,
# it isn't a hard upper bound, and formatted lines may exceed the line-length.
line-length = 120
exclude = ["tests/workers/rollout/test_sglang_async_rollout_sf_tools.py", "scripts/legacy_model_merger.py"]
[tool.ruff.lint]
isort = {known-first-party = ["verl"]}
# c.f. https://github.com/vllm-project/vllm/blob/ce8d6b75fc0586045df75ee1568a5b5f9957251b/pyproject.toml
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # isort
    "I",
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # `.log()` statement uses f-string
    "G004",
    # X | None for type annotations
    "UP045",
    # deprecated import
    "UP035",
]
# -------------------------------
# tool.mypy - typechecking config
# -------------------------------
[tool.mypy]
pretty = true
ignore_missing_imports = true
explicit_package_bases = true
follow_imports = "skip"
# Blanket silence
ignore_errors = true
[[tool.mypy.overrides]]
module = [
    "verl.trainer.config.algorithm",
    "verl.trainer.ppo.core_algos",
]
ignore_errors = false
# -------------------------------
# tool.setuptools - Additional config
# -------------------------------
[tool.setuptools]
# True means `setuptools` will attempt to include all relevant files in package_data automatically.
# This corresponds to `include_package_data=True` in setup.py.
include-package-data = true
# We read the version from a file in 'verl/version/version'
[tool.setuptools.dynamic]
version = {file = "verl/version/version"}
# If you need to mimic `package_dir={'': '.'}`:
[tool.setuptools.package-dir]
"" = "."
# If you need to include specific non-Python data (like YAML files or version file):
# This is the rough equivalent of package_data={'': ['version/*'], 'verl': ['trainer/config/*.yaml']}
[tool.setuptools.package-data]
verl = [
    "version/*",
    "trainer/config/*.yaml",
    "trainer/config/*/*.yaml",
]
# Recipe
The examples under `recipes/` are representative extensions to verl for specific end-to-end RL training recipes.
To help the community reproduce experiments, the verl team provides a snapshot of the codebase at the time each recipe is initially PR'ed into verl main. You can find these snapshots via [github branches](https://github.com/volcengine/verl/branches/all?query=recipe).
# Awesome work using verl
- [Logic-RL](https://github.com/Unakar/Logic-RL): a reproduction of DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. ![GitHub Repo stars](https://img.shields.io/github/stars/Unakar/Logic-RL)
- [Seed-Coder](https://github.com/ByteDance-Seed/Seed-Coder): RL training of Seed-Coder boosts performance on competitive programming ![GitHub Repo stars](https://img.shields.io/github/stars/ByteDance-Seed/Seed-Coder)
- [all-hands/openhands-lm-32b-v0.1](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model): A strong, open coding agent model, trained with [multi-turn fine-tuning](https://github.com/volcengine/verl/pull/195)
- [s3](https://github.com/pat-jj/s3): **Efficient Yet Effective** Search Agent Training via RL ![GitHub Repo stars](https://img.shields.io/github/stars/pat-jj/s3)
- [Rec-R1](https://arxiv.org/pdf/2503.24289): Bridging Generative Large Language Models and Recommendation Systems via Reinforcement Learning
- [Explore RL Data Scaling](https://arxiv.org/abs/2503.22230): Exploring Data Scaling Trends and Effects in Reinforcement Learning from Human Feedback
- [FIRE](https://arxiv.org/abs/2410.21236): Flaming-hot initiation with regular execution sampling for large language models
- [DQO](https://arxiv.org/abs/2410.09302): Enhancing multi-Step reasoning abilities of language models through direct Q-function optimization
- [ProRL](https://arxiv.org/abs/2505.24864): Prolonged Reinforcement Learning Expands Reasoning Boundaries in Large Language Models
- [cognition-engineering](https://github.com/gair-nlp/cognition-engineering): Test time scaling drives cognition engineering. ![GitHub Repo stars](https://img.shields.io/github/stars/gair-nlp/cognition-engineering)
- [Trust Region Preference Approximation](https://github.com/XueruiSu/Trust-Region-Preference-Approximation): A simple and stable **reinforcement learning algorithm** for LLM reasoning. ![GitHub Repo stars](https://img.shields.io/github/stars/XueruiSu/Trust-Region-Preference-Approximation)
- [AdaRFT](https://github.com/uscnlp-lime/verl): Efficient Reinforcement Finetuning via **Adaptive Curriculum Learning** ![GitHub Repo stars](https://img.shields.io/github/stars/uscnlp-lime/verl)
- [critic-rl](https://github.com/HKUNLP/critic-rl): LLM critics for code generation ![GitHub Repo stars](https://img.shields.io/github/stars/HKUNLP/critic-rl)
- [self-rewarding-reasoning-LLM](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with **generative reward models** ![GitHub Repo stars](https://img.shields.io/github/stars/RLHFlow/Self-rewarding-reasoning-LLM)
- [DeepEnlighten](https://github.com/DolbyUUU/DeepEnlighten): Reproduce R1 with **social reasoning** tasks and analyze key findings ![GitHub Repo stars](https://img.shields.io/github/stars/DolbyUUU/DeepEnlighten)
- [MetaSpatial](https://github.com/PzySeere/MetaSpatial): Reinforcing **3D Spatial Reasoning** in **VLMs** for the **Metaverse** ![GitHub Repo stars](https://img.shields.io/github/stars/PzySeere/MetaSpatial)
- [PURE](https://github.com/CJReinforce/PURE): **Credit assignment** is the key to successful reinforcement fine-tuning using **process reward model** ![GitHub Repo stars](https://img.shields.io/github/stars/CJReinforce/PURE)
- [cognitive-behaviors](https://github.com/kanishkg/cognitive-behaviors): Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs ![GitHub Repo stars](https://img.shields.io/github/stars/kanishkg/cognitive-behaviors)
- [deepscaler](https://github.com/agentica-project/rllm/tree/deepscaler): iterative context scaling with GRPO ![GitHub Repo stars](https://img.shields.io/github/stars/agentica-project/deepscaler)
- [DAPO](https://dapo-sia.github.io/): the fully open source SOTA RL algorithm that beats DeepSeek-R1-zero-32B ![GitHub Repo stars](https://img.shields.io/github/stars/volcengine/verl)
- [NoisyRollout](https://github.com/NUS-TRAIL/NoisyRollout): Reinforcing Visual Reasoning with Data Augmentation ![GitHub Repo stars](https://img.shields.io/github/stars/NUS-TRAIL/NoisyRollout)
# Char Count
## Introduction
Char count is a simple NLP task, created for beginners to grasp the idea of RLVR. The task can be trained with a tiny model (e.g., https://huggingface.co/HuggingFaceTB/SmolLM2-135M) on a consumer GPU with only 8GB of memory.
## Problem formulation
The prompt is: "How many {char} are there in {word}?". In order for LLM to better answer this question, we create SFT dataset with intermediate steps. For example,
```text
Question: How many n are there in n-i-n-e?
Answer:
n = n
i != n
n = n
e != n
\boxed{2}
```
Note that
- We add a dash between the chars to make the task easier, because each individual char is then tokenized to the same token by most tokenizers (see the sketch after this list).
- In the SFT dataset, we create a CoT by listing every individual char and whether it equals the target. At the end, it outputs the final answer inside the box.
- The task can be automatically verified.
- The word is not always meaningful: each char is sampled uniformly from a to z. We make the total length and the answer uniformly distributed within a range.
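
To see why the dashes matter, you can check the tokenization directly. A minimal sketch, assuming the `transformers` library and the SmolLM2 tokenizer mentioned above (this snippet is illustrative and not part of the recipe; the exact splits depend on the tokenizer's BPE merges):

```python
# Compare tokenization with and without dashes (hypothetical check).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
print(tok.tokenize("nine"))     # BPE likely merges the chars into one or two tokens
print(tok.tokenize("n-i-n-e"))  # the dashes keep each char as its own, consistent token
```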
## Scripts
To create the dataset, run
```bash
python3 create_dataset.py
```
We create a train set and a val set; both are used for SFT and RL. You can specify the total number of examples, the min/max word length, and the data path.
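
For example, the defaults baked into `create_dataset.py` are equivalent to:

```bash
python3 create_dataset.py \
    --total_number 10000 \
    --min_length 5 \
    --max_length 20 \
    --data_path ~/data/char_count
```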
To run the SFT
```bash
bash train_sft.sh
```
We train SFT for 3 epochs. After 3 epochs, the validation score is around 0.12.
To run GRPO
```bash
bash train_grpo.sh
```
We train GRPO for 2 epochs. After 2 epochs, the validation score is around 0.36.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Task description:
Given a random word and a random char, count the number of occurrence of char in the word.
Create CoT dataset that split the word into separate char. Then list the char and count the occurrence.
The word set comes from shakespeare
"""
import os.path
import random

prompt_template = "How many {} are there in word {}?"


def generate_random_char():
    return chr(97 + random.randint(0, 25))


def create_prompt_response(min_length=3, max_length=5):
    # randomly generate a length
    word_length = random.randint(min_length, max_length)
    # randomly generate a target count. This makes the answer uniformly distributed within [1, word_length]
    target_count_number = random.randint(1, word_length)
    char_lst = []
    # generate the word
    # step 1: generate the occurrences of the target char
    target_char = generate_random_char()
    for _ in range(target_count_number):
        char_lst.append(target_char)
    # step 2: generate the other chars
    for _ in range(word_length - target_count_number):
        while True:
            char = generate_random_char()
            if char != target_char:
                char_lst.append(char)
                break
    # step 3: randomly permute char_lst
    random.shuffle(char_lst)
    word = "-".join(char_lst)
    prompt = prompt_template.format(target_char, word)
    final_answer = []
    # cot
    number = 0
    for char in char_lst:
        cot = f"{char}"
        if char != target_char:
            cot += " != "
        else:
            cot += " = "
            number += 1
        cot += f"{target_char}."
        final_answer.append(cot)
    conclusion = f"\\boxed{{{number}}} {target_char} in {word}."
    final_answer.append(conclusion)
    final_answer = "\n".join(final_answer)
    return prompt, final_answer


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--total_number", type=int, default=10000)
    parser.add_argument("--min_length", type=int, default=5)
    parser.add_argument("--max_length", type=int, default=20)
    parser.add_argument("--data_path", type=str, default="~/data/char_count")
    args = vars(parser.parse_args())

    total_number = args["total_number"]
    min_length = args["min_length"]
    max_length = args["max_length"]
    data_path = args["data_path"]
    data_path = os.path.expanduser(data_path)

    full_output = []
    for _ in range(total_number):
        output = create_prompt_response(min_length=min_length, max_length=max_length)
        full_output.append(output)

    # random reorder
    random.shuffle(full_output)
    # split for train and test
    train_split_len = int(0.9 * len(full_output))
    train_outputs = full_output[:train_split_len]
    test_output = full_output[train_split_len:]

    sft_train_dataset = {"prompt": [], "response": []}
    for o in train_outputs:
        sft_train_dataset["prompt"].append(o[0])
        sft_train_dataset["response"].append(o[1])

    sft_test_dataset = {"prompt": [], "response": []}
    for o in test_output:
        sft_test_dataset["prompt"].append(o[0])
        sft_test_dataset["response"].append(o[1])

    import pandas as pd

    sft_train_dataset = pd.DataFrame(data=sft_train_dataset)
    sft_test_dataset = pd.DataFrame(data=sft_test_dataset)

    folder = os.path.join(data_path, "sft")
    os.makedirs(folder, exist_ok=True)
    sft_train_dataset.to_parquet(os.path.join(folder, "train.parquet"))
    sft_test_dataset.to_parquet(os.path.join(folder, "test.parquet"))

    # build RL dataset
    rl_train_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []}
    rl_test_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []}

    from verl.utils.reward_score.math import last_boxed_only_string, remove_boxed

    for o in train_outputs:
        prompt = o[0]
        response = o[1]
        prompt_with_template = [
            {
                "role": "user",
                "content": prompt,
            }
        ]
        rl_train_dataset["prompt"].append(prompt_with_template)
        rl_train_dataset["data_source"].append("char_count")
        rl_train_dataset["ability"].append("other")
        rl_train_dataset["reward_model"].append(
            {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))}
        )
        rl_train_dataset["extra_info"].append({"response": response})

    for o in test_output:
        prompt = o[0]
        response = o[1]
        prompt_with_template = [
            {
                "role": "user",
                "content": prompt,
            }
        ]
        rl_test_dataset["prompt"].append(prompt_with_template)
        rl_test_dataset["data_source"].append("char_count")
        rl_test_dataset["ability"].append("other")
        rl_test_dataset["reward_model"].append(
            {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))}
        )
        rl_test_dataset["extra_info"].append({"response": response})

    rl_train_dataset = pd.DataFrame(data=rl_train_dataset)
    rl_test_dataset = pd.DataFrame(data=rl_test_dataset)
    folder = os.path.join(data_path, "rl")
    os.makedirs(folder, exist_ok=True)
    rl_train_dataset.to_parquet(os.path.join(folder, "train.parquet"))
    rl_test_dataset.to_parquet(os.path.join(folder, "test.parquet"))
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Reward function
"""
from verl.utils.reward_score import math
def char_count_reward_function(data_source, solution_str, ground_truth, extra_info=None):
    try:
        last_boxed_string = math.last_boxed_only_string(solution_str)
        if last_boxed_string is None:
            return 0
        solution = math.remove_boxed(last_boxed_string)
        if solution == ground_truth:
            return 1
        else:
            return 0
    except Exception:
        print(ground_truth, solution_str)
        return 0
set -x
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$HOME/data/char_count/rl/train.parquet \
data.val_files=$HOME/data/char_count/rl/test.parquet \
data.train_batch_size=128 \
data.max_prompt_length=128 \
data.max_response_length=128 \
data.filter_overlong_prompts=False \
data.truncation='error' \
actor_rollout_ref.model.path=./models/sft/global_step_105 \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=16 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5000 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.0 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
actor_rollout_ref.rollout.n=8 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","tensorboard"]' \
trainer.project_name='verl_example' \
trainer.experiment_name='smol135m_grpo' \
trainer.val_before_train=True \
trainer.n_gpus_per_node=1 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=2 \
custom_reward_function.path=recipe/char_count/reward_function.py \
custom_reward_function.name=char_count_reward_function
set -x
nproc_per_node=1
save_path=./models/sft
torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/char_count/sft/train.parquet \
data.val_files=$HOME/data/char_count/sft/test.parquet \
data.prompt_key=prompt \
data.response_key=response \
data.micro_batch_size_per_gpu=8 \
data.max_length=256 \
data.train_batch_size=256 \
use_remove_padding=True \
model.partial_pretrain=HuggingFaceTB/SmolLM2-135M-Instruct \
trainer.default_local_dir=$save_path \
trainer.project_name=char_count-sft \
trainer.experiment_name=char_count-sft-SmolLM2-135M-Instruct \
trainer.total_epochs=3 \
trainer.logger=console
# Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
> Open-Source Algorithm Implementation & Experiment Running: [Yuxuan Tong](https://tongyx361.github.io/), [Guangming Sheng](https://hk.linkedin.com/in/guangming-sheng-b50640211)
> [!IMPORTANT]
>
> **🔥 News!!!**
>
> - [2025/04] We reproduced the results of two versions of DAPO ([Full](./run_dapo_qwen2.5_32b.sh) & [w/o Dynamic Sampling](./run_dapo_wo_ds_qwen2.5_32b.sh)), achieving 52% and 50% on AIME 2024 respectively, based on [the latest codebase on `recipe/dapo`](https://github.com/volcengine/verl/tree/recipe/dapo/recipe/dapo). Please check the details in [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n).
> - [2025/03] We published the training record of [an early version of DAPO (w/o Token-level PG Loss & Dynamic Sampling)](./run_dapo_early_qwen2.5_32b.sh), achieving 44% on AIME 2024, in [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n).
🏠 [Homepage](https://dapo-sia.github.io/) | 📝 [Paper@arXiv](https://arxiv.org/abs/2503.14476) | 🤗 [Datasets&Models@HF](https://huggingface.co/collections/BytedTsinghua-SIA/dapo-67d7f1517ee33c8aed059da0) | 🐱 [Code@GitHub](https://github.com/volcengine/verl/tree/recipe/dapo/recipe/dapo) | 🐱 [Repo@GitHub](https://github.com/BytedTsinghua-SIA/DAPO)
> We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. By making our work publicly available, we provide the broader research community and society with practical access to scalable reinforcement learning, enabling all to benefit from these advancements. Our system is based on the awesome [verl](https://github.com/volcengine/verl) framework. Thanks for their great work! Applying DAPO training to the Qwen2.5-32B base model outperforms the previous state-of-the-art DeepSeek-R1-Zero-Qwen-32B on AIME 2024, achieving **50%** accuracy with **50%** fewer training steps.
>
> ![dapo-main-result](https://dapo-sia.github.io/static/images/score.png)
## Quickstart
1. Prepare the datasets **on the Ray cluster**:
```bash
bash prepare_dapo_data.sh # This downloads the datasets to ${HOME}/verl/data by default
```
2. Submit the job to the Ray cluster **from any machine**:
```bash
cd verl # Repo root
export RAY_ADDRESS="http://${RAY_IP:-localhost}:8265" # The Ray cluster address to connect to
export WORKING_DIR="${PWD}" # The local directory to package to the Ray cluster
# Set the runtime environment like env vars and pip packages for the Ray cluster in yaml
export RUNTIME_ENV="./recipe/dapo/runtime_env.yaml" # This sets environment variables for the Ray cluster
bash recipe/dapo/run_dapo_qwen2.5_32b.sh # or other scripts
```
## Reproduction Runs
| Setup | AIME 2024 Acc. | Hardware | Image | Commit | Environment Variables | Training Script | Training Record |
| -------------------------------------------- | -------------- | --------- | -------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| DAPO | 52% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO w/o Dynamic Sampling | 50% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_wo_ds_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO w/o Token-level Loss & Dynamic Sampling | 44% | 16x8xH20 | `hiyouga/verl:ngc-th2.5.1-cu120-vllm0.7.4-hotfix` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_early_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_early_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
> [!IMPORTANT]
>
> **📢 Call for Contribution!**
>
> Welcome to submit your reproduction runs and setups!
## Configuration
### Separated Clip Epsilons (-> Clip-Higher)
An example configuration:
```yaml
actor_rollout_ref:
  actor:
    clip_ratio_low: 0.2
    clip_ratio_high: 0.28
```
`clip_ratio_low` and `clip_ratio_high` specify the $\varepsilon_{\text {low }}$ and $\varepsilon_{\text {high }}$ in the DAPO objective.
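
Written out, the per-token objective implied by the code below is (a restatement, with $r_t(\theta)$ the importance ratio and $\hat{A}_t$ the advantage):

$$
\mathcal{L}_t(\theta) = -\min\Big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}\big(r_t(\theta),\ 1-\varepsilon_{\text{low}},\ 1+\varepsilon_{\text{high}}\big)\,\hat{A}_t\Big)
$$

Raising $\varepsilon_{\text{high}}$ above $\varepsilon_{\text{low}}$ (Clip-Higher) leaves more room to increase the probability of low-probability tokens, which the paper argues mitigates entropy collapse.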
Core relevant code:
```python
pg_losses1 = -advantages * ratio
pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
pg_losses = torch.maximum(pg_losses1, pg_losses2)
```
### Dynamic Sampling (with Group Filtering)
An example configuration:
```yaml
data:
  gen_batch_size: 1536
  train_batch_size: 512
algorithm:
  filter_groups:
    enable: True
    metric: acc # score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 10 # Non-positive values mean no upper limit
```
Setting `filter_groups.enable` to `True` will filter out groups whose outputs' `metric` values are all identical, e.g., for `acc`, groups whose outputs' accuracies are all 1 or all 0.
The trainer will repeat sampling with `gen_batch_size` until enough qualified groups have accumulated to fill `train_batch_size`, or raise an error once it hits the upper limit specified by `max_num_gen_batches`.
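
Read concretely against the example configuration above: each generation round samples 1536 prompts and keeps only those whose `rollout.n` outputs do not all share the same `acc`; rounds repeat until 512 kept prompts have accumulated, and the run errors out if that takes more than 10 generation batches.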
Core relevant code:
```python
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
    print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
    num_gen_batches += 1
    max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
    if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
        print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
        continue
    else:
        raise ValueError(
            f'{num_gen_batches=} >= {max_num_gen_batches=}. Generated too many. Please check your data.'
        )
else:
    # Align the batch
    traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
    batch = batch[:traj_bsz]
```
### Flexible Loss Aggregation Mode (-> Token-level Loss)
An example configuration:
```yaml
actor_rollout_ref:
  actor:
    loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
    # NOTE: "token-mean" is the default behavior
```
Setting `loss_agg_mode` to `token-mean` averages the (policy gradient) loss across all tokens of all sequences in a mini-batch.
Core relevant code:
```python
if loss_agg_mode == "token-mean":
    loss = verl_F.masked_mean(loss_mat, loss_mask)
elif loss_agg_mode == "seq-mean-token-sum":
    seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)  # token-sum
    loss = torch.mean(seq_losses)  # seq-mean
elif loss_agg_mode == "seq-mean-token-mean":
    seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1)  # token-mean
    loss = torch.mean(seq_losses)  # seq-mean
else:
    raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
```
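
For intuition about the difference: take a mini-batch with two sequences of 10 and 1000 valid tokens. `token-mean` averages over all 1010 tokens, so the long sequence dominates the gradient; `seq-mean-token-mean` first averages within each sequence and then across sequences, so each token of the short sequence carries 100x the weight of a token in the long one. DAPO adopts token-level aggregation so that every token contributes equally regardless of its sequence's length.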
### Overlong Reward Shaping
An example configuration:
```yaml
data:
  max_response_length: 20480 # 16384 + 4096
reward_model:
  overlong_buffer:
    enable: True
    len: 4096
    penalty_factor: 1.0
```
Setting `overlong_buffer.enable` to `True` penalizes outputs that are overlong but still within the hard context limit.
Specifically, the penalty increases linearly from `0` to `overlong_buffer.penalty_factor` as the output length grows from `max_response_length - overlong_buffer.len` to `max_response_length`.
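
Worked through with the example configuration above: the penalty-free length is 20480 - 4096 = 16384 tokens; an 18432-token response exceeds that by 2048 and receives an extra reward of -(2048 / 4096) * 1.0 = -0.5, and a response at the full 20480-token limit receives -1.0.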
Core relevant code:
```python
if self.overlong_buffer_cfg.enable:
    overlong_buffer_len = self.overlong_buffer_cfg.len
    expected_len = self.max_resp_len - overlong_buffer_len
    exceed_len = valid_response_length - expected_len
    overlong_penalty_factor = self.overlong_buffer_cfg.penalty_factor
    overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0)
    reward += overlong_reward
```
## FAQ
### Where is the "Overlong Filtering" in the paper?
Most experiments in the paper, including the best-performing one, were run without Overlong Filtering, because it overlaps to some extent with Overlong Reward Shaping in how it handles learning from the longest outputs. So we don't implement it here.
### What's the difference between [the `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl/tree/main/recipe/dapo) and the [`recipe/dapo` branch](https://github.com/volcengine/verl/tree/recipe/dapo/recipe/dapo)?
[The `recipe/dapo` branch](https://github.com/volcengine/verl/tree/recipe/dapo/recipe/dapo) is for **as-is reproduction** and thus won't be updated with new features.
[The `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl/tree/main/recipe/dapo) works as an example of how to extend the latest `verl` to implement an algorithm recipe, which will be maintained with new features.
### Why can't I produce similar results after modifications?
Today's RL infrastructure still has inherent robustness issues, which we are working hard to improve.
We strongly recommend modifying only one thing at a time.
We also list some known problems here:
1. Enabling CUDA graph (`enforce_eager=False`) might cause model performance degradation; the cause is still under investigation.
hydra:
  searchpath:
    - file://verl/trainer/config

defaults:
  - ppo_trainer
  - _self_

data:
  gen_batch_size: ${data.train_batch_size}

reward_model:
  reward_manager: dapo
  overlong_buffer:
    enable: False # We try to avoid forgetting to set enable
    len: 0
    penalty_factor: 0.0
    log: False

algorithm:
  filter_groups:
    _target_: verl.trainer.config.FilterGroupsConfig
    enable: False # We try to avoid forgetting to set enable
    metric: null # acc / score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 0 # Non-positive values mean no upper limit

trainer:
  project_name: verl-dapo
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
FSDP PPO Trainer with Ray-based single controller.
This trainer supports model-agonistic model initialization with huggingface
"""
import uuid
from collections import defaultdict
from copy import deepcopy
from pprint import pprint
import numpy as np
import torch
from tqdm import tqdm
from verl import DataProto
from verl.trainer.ppo.core_algos import agg_loss
from verl.trainer.ppo.metric_utils import (
    compute_data_metrics,
    compute_throughout_metrics,
    compute_timing_metrics,
    reduce_metrics,
)
from verl.trainer.ppo.ray_trainer import (
    AdvantageEstimator,
    RayPPOTrainer,
    apply_kl_penalty,
    compute_advantage,
    compute_response_mask,
)
from verl.utils.profiler import marked_timer
class RayDAPOTrainer(RayPPOTrainer):
"""
Note that this trainer runs on the driver process on a single CPU/GPU node.
"""
def fit(self):
"""
The training loop of PPO.
The driver process only need to call the compute functions of the worker group through RPC
to construct the PPO dataflow.
The light-weight advantage computation is done on the driver process.
"""
from omegaconf import OmegaConf
from verl.utils.tracking import Tracking
logger = Tracking(
project_name=self.config.trainer.project_name,
experiment_name=self.config.trainer.experiment_name,
default_backend=self.config.trainer.logger,
config=OmegaConf.to_container(self.config, resolve=True),
)
self.global_steps = 0
self.gen_steps = 0
# load checkpoint before doing anything
self._load_checkpoint()
# perform validation before training
# currently, we only support validation using the reward_function.
if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
val_metrics = self._validate()
assert val_metrics, f"{val_metrics=}"
pprint(f"Initial validation metrics: {val_metrics}")
logger.log(data=val_metrics, step=self.global_steps)
if self.config.trainer.get("val_only", False):
return
# add tqdm
progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
# we start from step 1
self.global_steps += 1
self.gen_steps += 1
last_val_metrics = None
timing_raw = defaultdict(float)
batch = None
num_prompt_in_batch = 0
num_gen_batches = 0
for epoch in range(self.config.trainer.total_epochs):
for batch_dict in self.train_dataloader:
metrics = {}
do_profile = (
self.global_steps in self.config.trainer.profile_steps
if self.config.trainer.profile_steps is not None
else False
)
with marked_timer("start_profile", timing_raw):
if do_profile:
self.actor_rollout_wg.start_profile(role="e2e", profile_step=self.global_steps)
if self.use_reference_policy:
self.ref_policy_wg.start_profile()
if self.use_critic:
self.critic_wg.start_profile()
if self.use_rm:
self.rm_wg.start_profile()
new_batch: DataProto = DataProto.from_single_dict(batch_dict)
num_gen_batches += 1
# pop those keys for generation
if "multi_modal_data" in new_batch.non_tensor_batch.keys():
gen_batch = new_batch.pop(
batch_keys=["input_ids", "attention_mask", "position_ids"],
non_tensor_batch_keys=["raw_prompt_ids", "multi_modal_data"],
)
else:
gen_batch = new_batch.pop(
batch_keys=["input_ids", "attention_mask", "position_ids"],
non_tensor_batch_keys=["raw_prompt_ids"],
)
gen_batch = gen_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
is_last_step = self.gen_steps >= self.total_training_steps
with marked_timer("step", timing_raw):
# generate a batch
with marked_timer("gen", timing_raw, "red"):
gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)
if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
with marked_timer("gen_max", timing_raw, "red"):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch)
new_batch = new_batch.union(gen_baseline_output)
reward_baseline_tensor = self.reward_fn(new_batch)
reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)
new_batch.pop(batch_keys=list(gen_baseline_output.batch.keys()))
new_batch.batch["reward_baselines"] = reward_baseline_tensor
del gen_baseline_batch, gen_baseline_output
new_batch.non_tensor_batch["uid"] = np.array(
[str(uuid.uuid4()) for _ in range(len(new_batch.batch))], dtype=object
)
# repeat to align with repeated responses in rollout
new_batch = new_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
new_batch = new_batch.union(gen_batch_output)
with marked_timer("reward", timing_raw, "yellow"):
# compute scores. Support both model and function-based.
# We first compute the scores using reward model. Then, we call reward_fn to combine
# the results from reward model and rule-based results.
if self.use_rm:
# we first compute reward model score
reward_tensor = self.rm_wg.compute_rm_score(new_batch)
new_batch = new_batch.union(reward_tensor)
# we combine with rule-based rm
reward_extra_infos_dict: dict[str, list]
try:
reward_result = self.reward_fn(new_batch, return_dict=True)
reward_tensor = reward_result["reward_tensor"]
reward_extra_infos_dict = reward_result.get("reward_extra_info", {})
except Exception as e:
print(f"Error in reward_fn: {e}")
reward_tensor = self.reward_fn(new_batch)
reward_extra_infos_dict = {}
new_batch.batch["token_level_scores"] = reward_tensor
if reward_extra_infos_dict:
new_batch.non_tensor_batch.update(
{k: np.array(v) for k, v in reward_extra_infos_dict.items()}
)
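                        # KL can be applied either here as a token-level reward penalty
                        # (algorithm.use_kl_in_reward) or as a loss term in the actor update
                        # (actor.use_kl_loss); this branch handles the reward-side variant.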
                        # compute rewards. apply_kl_penalty if available
                        if self.config.algorithm.use_kl_in_reward:
                            new_batch, kl_metrics = apply_kl_penalty(
                                new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
                            )
                            # TODO: these metrics will be cleared if we use multiple generation batches
                            metrics.update(kl_metrics)
                        else:
                            new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]
                    if not self.config.algorithm.filter_groups.enable:
                        batch = new_batch
                    else:
                        # NOTE: When fewer prompts than the train batch size remain after
                        # filtering, we skip ahead to the next generation batch.
                        metric_name = self.config.algorithm.filter_groups.metric
                        if metric_name == "seq_final_reward":
                            # Turn to numpy for easier filtering
                            new_batch.non_tensor_batch["seq_final_reward"] = (
                                new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
                            )
                        elif metric_name == "seq_reward":
                            new_batch.non_tensor_batch["seq_reward"] = (
                                new_batch.batch["token_level_scores"].sum(dim=-1).numpy()
                            )

                        # Collect the chosen metric for each trajectory, grouped by prompt uid
                        prompt_uid2metric_vals = defaultdict(list)
                        for uid, metric_val in zip(
                            new_batch.non_tensor_batch["uid"], new_batch.non_tensor_batch[metric_name], strict=True
                        ):
                            prompt_uid2metric_vals[uid].append(metric_val)

                        prompt_uid2metric_std = {}
                        for prompt_uid, metric_vals in prompt_uid2metric_vals.items():
                            prompt_uid2metric_std[prompt_uid] = np.std(metric_vals)
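                        # Keep only prompts with non-degenerate groups: zero std means identical
                        # rewards within a group, i.e. zero group-relative advantage everywhere.
                        # Singleton groups are kept even though their std is trivially zero.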
                        kept_prompt_uids = [
                            uid
                            for uid, std in prompt_uid2metric_std.items()
                            if std > 0 or len(prompt_uid2metric_vals[uid]) == 1
                        ]
                        num_prompt_in_batch += len(kept_prompt_uids)

                        kept_traj_idxs = []
                        for idx, traj_from_prompt_uid in enumerate(new_batch.non_tensor_batch["uid"]):
                            if traj_from_prompt_uid in kept_prompt_uids:
                                kept_traj_idxs.append(idx)

                        new_batch = new_batch[kept_traj_idxs]
                        batch = new_batch if batch is None else DataProto.concat([batch, new_batch])

                        prompt_bsz = self.config.data.train_batch_size
                        if num_prompt_in_batch < prompt_bsz:
                            print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
                            max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
                            if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
                                print(f"{num_gen_batches=}. Keep generating...")
                                progress_bar.update(1)
                                self.gen_steps += 1
                                continue
                            else:
                                raise ValueError(
                                    f"{num_gen_batches=} >= {max_num_gen_batches=}."
                                    + " Generated too many batches. Please check whether your data are too difficult."
                                    + " You could also try setting max_num_gen_batches=0 to allow unlimited retries."
                                )
                        else:
                            # Align the batch: keep exactly train_batch_size prompts
                            # (train_batch_size * rollout.n trajectories).
                            traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
                            batch = batch[:traj_bsz]

                    # === Updating ===

                    batch.batch["response_mask"] = compute_response_mask(batch)

                    # Balance the number of valid tokens across DP ranks.
                    # NOTE: This usually changes the order of data in the `batch`,
                    # which won't affect the advantage calculation (since it's based on uid),
                    # but might affect the loss calculation (due to the change of mini-batching).
                    # TODO: Decouple the DP balancing and mini-batching.
                    if self.config.trainer.balance_batch:
                        self._balance_batch(batch, metrics=metrics)

                    # compute global_valid tokens
                    batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
                    # recompute old_log_probs
                    with marked_timer("old_log_prob", timing_raw, "blue"):
                        old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
                        entropys = old_log_prob.batch["entropys"]
                        response_masks = batch.batch["response_mask"]
                        loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
                        entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
                        old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
                        metrics.update(old_log_prob_metrics)
                        old_log_prob.batch.pop("entropys")
                        batch = batch.union(old_log_prob)

                    if self.use_reference_policy:
                        # compute reference log_prob
                        with marked_timer("ref", timing_raw, "olive"):
                            ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
                            batch = batch.union(ref_log_prob)

                    # compute values
                    if self.use_critic:
                        with marked_timer("values", timing_raw, "cyan"):
                            values = self.critic_wg.compute_values(batch)
                            batch = batch.union(values)
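                    # For GRPO the advantage is group-relative: with rewards r_1..r_n in one
                    # prompt's group, A_i = (r_i - mean(r)) / (std(r) + eps) when
                    # norm_adv_by_std_in_grpo is True, and A_i = r_i - mean(r) otherwise.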
with marked_timer("adv", timing_raw, "brown"):
# compute advantages, executed on the driver process
norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
batch = compute_advantage(
batch,
adv_estimator=self.config.algorithm.adv_estimator,
gamma=self.config.algorithm.gamma,
lam=self.config.algorithm.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
)
# update critic
if self.use_critic:
with marked_timer("update_critic", timing_raw, "pink"):
critic_output = self.critic_wg.update_critic(batch)
critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
metrics.update(critic_output_metrics)
# implement critic warmup
if self.config.trainer.critic_warmup <= self.global_steps:
# update actor
with marked_timer("update_actor", timing_raw, "red"):
actor_output = self.actor_rollout_wg.update_actor(batch)
actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
metrics.update(actor_output_metrics)
# validate
if (
self.val_reward_fn is not None
and self.config.trainer.test_freq > 0
and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
):
with marked_timer("testing", timing_raw, "green"):
val_metrics: dict = self._validate()
if is_last_step:
last_val_metrics = val_metrics
metrics.update(val_metrics)
if self.config.trainer.save_freq > 0 and (
is_last_step or self.global_steps % self.config.trainer.save_freq == 0
):
with marked_timer("save_checkpoint", timing_raw, "green"):
self._save_checkpoint()
with marked_timer("stop_profile", timing_raw):
if do_profile:
self.actor_rollout_wg.stop_profile()
if self.use_reference_policy:
self.ref_policy_wg.stop_profile()
if self.use_critic:
self.critic_wg.stop_profile()
if self.use_rm:
self.rm_wg.stop_profile()
# collect metrics
metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
# TODO: implement actual tflpo and theoretical tflpo
n_gpus = self.resource_pool_manager.get_n_gpus()
metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
timing_raw = defaultdict(float) # clear timing
metrics["train/num_gen_batches"] = num_gen_batches
batch = None
num_prompt_in_batch = 0
num_gen_batches = 0
# TODO: make a canonical logger that supports various backend
logger.log(data=metrics, step=self.global_steps)
if is_last_step:
pprint(f"Final validation metrics: {last_val_metrics}")
progress_bar.close()
return
progress_bar.update(1)
self.global_steps += 1
self.gen_steps += 1