# run on 4xH100
# make sure your current working directory is the root of the project
set -x
export HYDRA_FULL_ERROR=1
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='geo3k_multiturn_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=256 \
data.max_prompt_length=2048 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-3B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='geo3k_async_rl' \
trainer.experiment_name='qwen2.5-3b_function_rm-geo3k-async-sgl-multi-w-tool-verify-n16-4cards' \
trainer.n_gpus_per_node=4 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
trainer.total_epochs=15 \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=8192 \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=8192 \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 \
critic.ppo_max_token_len_per_gpu=8192 \
critic.forward_max_token_len_per_gpu=8192 \
data.train_files=$HOME/data/geo3k/train.parquet \
data.val_files=$HOME/data/geo3k/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/geo3k_tool_config.yaml" \
$@
# run on 8xH100
# make sure your current working directory is the root of the project
# this is a verification training script, the parallel setting should be tuned to your model
set -x
export PYTHONUNBUFFERED=1
export RAY_DEDUP_LOGS=0
export RUST_BACKTRACE=1
export HYDRA_FULL_ERROR=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='geo3k_multiturn_megatron_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=256 \
data.max_prompt_length=2048 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-3B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.context_parallel_size=2 \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.megatron.seed=42 \
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.context_parallel_size=2 \
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='geo3k_async_rl' \
trainer.experiment_name='qwen2.5-3b_function_rm-geo3k-sgl-multi-w-tool-n8-mcore-v2505201745_seed42' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
data.train_files=$HOME/data/geo3k_multiturn_w_tool/train.parquet \
data.val_files=$HOME/data/geo3k_multiturn_w_tool/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/geo3k_tool_config.yaml" \
trainer.total_epochs=15 $@
# run on 8xH100
# make sure your current working directory is the root of the project
set -x
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='gsm8k_multiturn_grpo' \
algorithm.adv_estimator=grpo \
data.sampler.class_name="RandomCurriculumSampler" \
data.sampler.class_path="pkg://tests.utils.dataset.test_create_rl_sampler_on_cpu" \
data.dataloader_num_workers=0 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.train_batch_size=256 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='gsm8k_async_rl' \
trainer.experiment_name='qwen3-4b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \
trainer.total_epochs=15 $@
# run on 8xH100
# make sure your current working directory is the root of the project
set -x
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-512}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-8}
OFFLOAD=${OFFLOAD:-False}
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='gsm8k_multiturn_grpo_w_interaction' \
algorithm.adv_estimator=grpo \
data.train_batch_size=$TRAIN_BATCH_SIZE \
data.max_prompt_length=1024 \
data.max_response_length=$((1024 * 3)) \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
+actor_rollout_ref.model.enable_activation_offloading=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=$TRAIN_BATCH_SIZE \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.fsdp_config.param_offload=$OFFLOAD \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=$OFFLOAD \
actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
actor_rollout_ref.rollout.n=8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE \
actor_rollout_ref.ref.fsdp_config.param_offload=$OFFLOAD \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='gsm8k_async_rl' \
trainer.experiment_name='qwen2.5-0.5b_function_rm-gsm8k-sgl-multi-w-interaction-n8' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
data.train_files=$HOME/data/gsm8k_verl_sgl_multi_turn_w_interaction/train.parquet \
data.val_files=$HOME/data/gsm8k_verl_sgl_multi_turn_w_interaction/test.parquet \
actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \
trainer.total_epochs=15 $@
# run on 8xH100
# make sure your current working directory is the root of the project
set -x
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='gsm8k_multiturn_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=256 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='gsm8k_async_rl' \
trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \
trainer.total_epochs=15 \
actor_rollout_ref.rollout.update_weights_bucket_megabytes=512 $@
# run on 4xH100
# make sure your current working directory is the root of the project
set -x
export HYDRA_FULL_ERROR=1
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='gsm8k_multiturn_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=256 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='gsm8k_async_rl' \
trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-async-sgl-multi-w-tool-verify-n16-4cards' \
trainer.n_gpus_per_node=4 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
trainer.total_epochs=15 \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=8192 \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=8192 \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 \
critic.ppo_max_token_len_per_gpu=8192 \
critic.forward_max_token_len_per_gpu=8192 \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \
actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \
actor_rollout_ref.rollout.multi_turn.max_user_turns=1 \
$@
# run on 8xH100
# make sure your current working directory is the root of the project
set -x
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='gsm8k_multiturn_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=256 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.mode=async \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.rollout.trace.backend=mlflow \
actor_rollout_ref.rollout.trace.token2text=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","mlflow"]' \
trainer.project_name='gsm8k_tool-agent' \
trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-tool-agent-verify-n16' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
trainer.total_training_steps=2 \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \
trainer.total_epochs=15 $@
# run on 8xH100
# make sure your current working directory is the root of the project
# this is a verification training script, the parallel setting should be tuned to your model
set -x
export PYTHONUNBUFFERED=1
export RAY_DEDUP_LOGS=0
export RUST_BACKTRACE=1
export HYDRA_FULL_ERROR=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='gsm8k_multiturn_megatron_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=/user/longxiang1/models/Qwen/Qwen2.5-3B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.context_parallel_size=2 \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.megatron.seed=42 \
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.context_parallel_size=2 \
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='gsm8k_async_rl' \
trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-n8-mcore-v2505201745_seed42' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
data.train_files=/user/longxiang1/data/gsm8k_verl_sgl_multi_turn_preprocessed_v2/train.parquet \
data.val_files=/user/longxiang1/data/gsm8k_verl_sgl_multi_turn_preprocessed_v2/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \
trainer.total_epochs=15 $@
# run on 8xH100
# make sure your current working directory is the root of the project
set -x
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='gsm8k_multiturn_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=256 \
data.max_prompt_length=1024 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen3-4B \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='gsm8k_async_rl' \
trainer.experiment_name='qwen3-4b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=20 \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \
trainer.total_epochs=15 $@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 Search-R1 Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/PeterGriffinJin/Search-R1/blob/main/scripts/download.py
import argparse
from huggingface_hub import hf_hub_download
parser = argparse.ArgumentParser(description="Download files from a Hugging Face dataset repository.")
parser.add_argument("--repo_id", type=str, default="PeterJinGo/wiki-18-e5-index", help="Hugging Face repository ID")
parser.add_argument("--save_path", type=str, required=True, help="Local directory to save files")
args = parser.parse_args()
repo_id = "PeterJinGo/wiki-18-e5-index"
for file in ["part_aa", "part_ab"]:
hf_hub_download(
repo_id=repo_id,
filename=file, # e.g., "e5_Flat.index"
repo_type="dataset",
local_dir=args.save_path,
)
repo_id = "PeterJinGo/wiki-18-corpus"
hf_hub_download(
repo_id=repo_id,
filename="wiki-18.jsonl.gz",
repo_type="dataset",
local_dir=args.save_path,
)
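# Example usage (a sketch; the script filename and save path below are placeholders):
#   python download.py --save_path $HOME/data/wiki-18
# This fetches the two index shards (part_aa, part_ab) and the wiki-18 corpus archive
# into --save_path; the shards can then be concatenated to rebuild the FAISS index,
# e.g. `cat part_aa part_ab > e5_Flat.index`.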
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 Search-R1 Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/PeterGriffinJin/Search-R1/blob/main/search_r1/search/retrieval_server.py
import argparse
import json
import warnings
from typing import Optional
import datasets
import faiss
import numpy as np
import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
def load_corpus(corpus_path: str):
corpus = datasets.load_dataset("json", data_files=corpus_path, split="train", num_proc=4)
return corpus
def load_docs(corpus, doc_idxs):
results = [corpus[int(idx)] for idx in doc_idxs]
return results
def load_model(model_path: str, use_fp16: bool = False):
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
model.eval()
model.cuda()
if use_fp16:
model = model.half()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
return model, tokenizer
def pooling(pooler_output, last_hidden_state, attention_mask=None, pooling_method="mean"):
if pooling_method == "mean":
last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
elif pooling_method == "cls":
return last_hidden_state[:, 0]
elif pooling_method == "pooler":
return pooler_output
else:
raise NotImplementedError("Pooling method not implemented!")
class Encoder:
def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16):
self.model_name = model_name
self.model_path = model_path
self.pooling_method = pooling_method
self.max_length = max_length
self.use_fp16 = use_fp16
self.model, self.tokenizer = load_model(model_path=model_path, use_fp16=use_fp16)
self.model.eval()
@torch.no_grad()
def encode(self, query_list: list[str], is_query=True) -> np.ndarray:
# processing query for different encoders
if isinstance(query_list, str):
query_list = [query_list]
if "e5" in self.model_name.lower():
if is_query:
query_list = [f"query: {query}" for query in query_list]
else:
query_list = [f"passage: {query}" for query in query_list]
if "bge" in self.model_name.lower():
if is_query:
query_list = [
f"Represent this sentence for searching relevant passages: {query}" for query in query_list
]
inputs = self.tokenizer(
query_list, max_length=self.max_length, padding=True, truncation=True, return_tensors="pt"
)
inputs = {k: v.cuda() for k, v in inputs.items()}
if "T5" in type(self.model).__name__:
# T5-based retrieval model
decoder_input_ids = torch.zeros((inputs["input_ids"].shape[0], 1), dtype=torch.long).to(
inputs["input_ids"].device
)
output = self.model(**inputs, decoder_input_ids=decoder_input_ids, return_dict=True)
query_emb = output.last_hidden_state[:, 0, :]
else:
output = self.model(**inputs, return_dict=True)
query_emb = pooling(
output.pooler_output, output.last_hidden_state, inputs["attention_mask"], self.pooling_method
)
if "dpr" not in self.model_name.lower():
query_emb = torch.nn.functional.normalize(query_emb, dim=-1)
query_emb = query_emb.detach().cpu().numpy()
query_emb = query_emb.astype(np.float32, order="C")
del inputs, output
torch.cuda.empty_cache()
return query_emb
class BaseRetriever:
def __init__(self, config):
self.config = config
self.retrieval_method = config.retrieval_method
self.topk = config.retrieval_topk
self.index_path = config.index_path
self.corpus_path = config.corpus_path
def _search(self, query: str, num: int, return_score: bool):
raise NotImplementedError
def _batch_search(self, query_list: list[str], num: int, return_score: bool):
raise NotImplementedError
def search(self, query: str, num: int = None, return_score: bool = False):
return self._search(query, num, return_score)
def batch_search(self, query_list: list[str], num: int = None, return_score: bool = False):
return self._batch_search(query_list, num, return_score)
class BM25Retriever(BaseRetriever):
def __init__(self, config):
super().__init__(config)
from pyserini.search.lucene import LuceneSearcher
self.searcher = LuceneSearcher(self.index_path)
self.contain_doc = self._check_contain_doc()
if not self.contain_doc:
self.corpus = load_corpus(self.corpus_path)
self.max_process_num = 8
def _check_contain_doc(self):
return self.searcher.doc(0).raw() is not None
def _search(self, query: str, num: int = None, return_score: bool = False):
if num is None:
num = self.topk
hits = self.searcher.search(query, num)
if len(hits) < 1:
if return_score:
return [], []
else:
return []
scores = [hit.score for hit in hits]
if len(hits) < num:
warnings.warn("Not enough documents retrieved!", stacklevel=2)
else:
hits = hits[:num]
if self.contain_doc:
all_contents = [json.loads(self.searcher.doc(hit.docid).raw())["contents"] for hit in hits]
results = [
{
"title": content.split("\n")[0].strip('"'),
"text": "\n".join(content.split("\n")[1:]),
"contents": content,
}
for content in all_contents
]
else:
results = load_docs(self.corpus, [hit.docid for hit in hits])
if return_score:
return results, scores
else:
return results
def _batch_search(self, query_list: list[str], num: int = None, return_score: bool = False):
results = []
scores = []
for query in query_list:
item_result, item_score = self._search(query, num, True)
results.append(item_result)
scores.append(item_score)
if return_score:
return results, scores
else:
return results
class DenseRetriever(BaseRetriever):
def __init__(self, config):
super().__init__(config)
self.index = faiss.read_index(self.index_path)
if config.faiss_gpu:
co = faiss.GpuMultipleClonerOptions()
co.useFloat16 = True
co.shard = True
self.index = faiss.index_cpu_to_all_gpus(self.index, co=co)
self.corpus = load_corpus(self.corpus_path)
self.encoder = Encoder(
model_name=self.retrieval_method,
model_path=config.retrieval_model_path,
pooling_method=config.retrieval_pooling_method,
max_length=config.retrieval_query_max_length,
use_fp16=config.retrieval_use_fp16,
)
self.topk = config.retrieval_topk
self.batch_size = config.retrieval_batch_size
def _search(self, query: str, num: int = None, return_score: bool = False):
if num is None:
num = self.topk
query_emb = self.encoder.encode(query)
scores, idxs = self.index.search(query_emb, k=num)
idxs = idxs[0]
scores = scores[0]
results = load_docs(self.corpus, idxs)
if return_score:
return results, scores.tolist()
else:
return results
def _batch_search(self, query_list: list[str], num: int = None, return_score: bool = False):
if isinstance(query_list, str):
query_list = [query_list]
if num is None:
num = self.topk
results = []
scores = []
for start_idx in tqdm(range(0, len(query_list), self.batch_size), desc="Retrieval process: "):
query_batch = query_list[start_idx : start_idx + self.batch_size]
batch_emb = self.encoder.encode(query_batch)
batch_scores, batch_idxs = self.index.search(batch_emb, k=num)
batch_scores = batch_scores.tolist()
batch_idxs = batch_idxs.tolist()
            # load_docs is not vectorized; it simply builds a plain Python list of documents
flat_idxs = sum(batch_idxs, [])
batch_results = load_docs(self.corpus, flat_idxs)
# chunk them back
batch_results = [batch_results[i * num : (i + 1) * num] for i in range(len(batch_idxs))]
results.extend(batch_results)
scores.extend(batch_scores)
del batch_emb, batch_scores, batch_idxs, query_batch, flat_idxs, batch_results
torch.cuda.empty_cache()
if return_score:
return results, scores
else:
return results
def get_retriever(config):
if config.retrieval_method == "bm25":
return BM25Retriever(config)
else:
return DenseRetriever(config)
#####################################
# FastAPI server below
#####################################
class Config:
"""
Minimal config class (simulating your argparse)
Replace this with your real arguments or load them dynamically.
"""
def __init__(
self,
retrieval_method: str = "bm25",
retrieval_topk: int = 10,
index_path: str = "./index/bm25",
corpus_path: str = "./data/corpus.jsonl",
dataset_path: str = "./data",
data_split: str = "train",
faiss_gpu: bool = True,
retrieval_model_path: str = "./model",
retrieval_pooling_method: str = "mean",
retrieval_query_max_length: int = 256,
retrieval_use_fp16: bool = False,
retrieval_batch_size: int = 128,
):
self.retrieval_method = retrieval_method
self.retrieval_topk = retrieval_topk
self.index_path = index_path
self.corpus_path = corpus_path
self.dataset_path = dataset_path
self.data_split = data_split
self.faiss_gpu = faiss_gpu
self.retrieval_model_path = retrieval_model_path
self.retrieval_pooling_method = retrieval_pooling_method
self.retrieval_query_max_length = retrieval_query_max_length
self.retrieval_use_fp16 = retrieval_use_fp16
self.retrieval_batch_size = retrieval_batch_size
class QueryRequest(BaseModel):
queries: list[str]
topk: Optional[int] = None
return_scores: bool = False
app = FastAPI()
@app.post("/retrieve")
def retrieve_endpoint(request: QueryRequest):
"""
Endpoint that accepts queries and performs retrieval.
Input format:
{
"queries": ["What is Python?", "Tell me about neural networks."],
"topk": 3,
"return_scores": true
}
Output format (when return_scores=True,similarity scores are returned):
{
"result": [
[ # Results for each query
{
{"document": doc, "score": score}
},
# ... more documents
],
# ... results for other queries
]
}
"""
if not request.topk:
request.topk = config.retrieval_topk # fallback to default
    # Perform batch retrieval. Note that batch_search returns only the results
    # list when return_score is False, so unpack accordingly.
    if request.return_scores:
        results, scores = retriever.batch_search(
            query_list=request.queries, num=request.topk, return_score=True
        )
    else:
        results = retriever.batch_search(query_list=request.queries, num=request.topk, return_score=False)
        scores = []
# Format response
resp = []
for i, single_result in enumerate(results):
if request.return_scores:
# If scores are returned, combine them with results
combined = []
for doc, score in zip(single_result, scores[i], strict=True):
combined.append({"document": doc, "score": score})
resp.append(combined)
else:
resp.append(single_result)
return {"result": resp}
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Launch the local faiss retriever.")
parser.add_argument(
"--index_path", type=str, default="/home/peterjin/mnt/index/wiki-18/e5_Flat.index", help="Corpus indexing file."
)
parser.add_argument(
"--corpus_path",
type=str,
default="/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl",
help="Local corpus file.",
)
parser.add_argument("--topk", type=int, default=3, help="Number of retrieved passages for one query.")
parser.add_argument("--retriever_name", type=str, default="e5", help="Name of the retriever model.")
parser.add_argument(
"--retriever_model", type=str, default="intfloat/e5-base-v2", help="Path of the retriever model."
)
parser.add_argument("--faiss_gpu", action="store_true", help="Use GPU for computation")
args = parser.parse_args()
# 1) Build a config (could also parse from arguments).
# In real usage, you'd parse your CLI arguments or environment variables.
config = Config(
retrieval_method=args.retriever_name, # or "dense"
index_path=args.index_path,
corpus_path=args.corpus_path,
retrieval_topk=args.topk,
faiss_gpu=args.faiss_gpu,
retrieval_model_path=args.retriever_model,
retrieval_pooling_method="mean",
retrieval_query_max_length=256,
retrieval_use_fp16=True,
retrieval_batch_size=512,
)
# 2) Instantiate a global retriever so it is loaded once and reused.
retriever = get_retriever(config)
# 3) Launch the server. By default, it listens on http://127.0.0.1:8000
uvicorn.run(app, host="0.0.0.0", port=8000)
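# Example client call (a minimal sketch; assumes the server above is running locally on
# port 8000 and the query shown is a placeholder):
#
#   curl -X POST http://127.0.0.1:8000/retrieve \
#     -H "Content-Type: application/json" \
#     -d '{"queries": ["What is Python?"], "topk": 3, "return_scores": true}'
#
# The response follows the format documented on the /retrieve endpoint above.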
# run on 8xH20
# make sure your current working directory is the root of the project
set -x
ulimit -n 65535
PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
TRAIN_DATA="$HOME/data/searchR1_processed_direct/train.parquet"
VAL_DATA="$HOME/data/searchR1_processed_direct/test.parquet"
TOOL_CONFIG="$CONFIG_PATH/tool_config/search_tool_config.yaml"
python3 -m verl.trainer.main_ppo \
--config-path="$CONFIG_PATH" \
--config-name='search_multiturn_grpo' \
algorithm.adv_estimator=grpo \
data.train_batch_size=512 \
data.val_batch_size=256 \
data.max_prompt_length=4096 \
data.max_response_length=3000 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.max_model_len=15000 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=sglang \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.multi_turn.max_assistant_turns=2 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.val_before_train=False \
trainer.logger='["console","wandb"]' \
trainer.project_name='search_r1_like_async_rl' \
trainer.experiment_name='qwen2.5-3b-instruct_function_rm-search-async-sgl-multi-w-searchtool-verify-n16' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=100 \
trainer.test_freq=50 \
data.train_files="$TRAIN_DATA" \
data.val_files="$VAL_DATA" \
actor_rollout_ref.rollout.multi_turn.tool_config_path="$TOOL_CONFIG" \
trainer.total_epochs=1 $@
#!/bin/bash
#SBATCH --job-name=verl-ray-on-slurm
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --mem=200G
#SBATCH --partition=your-partition
#SBATCH --time=01:00:00
#SBATCH --account=your-account
#SBATCH --gpus-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err
# load necessary modules
# replace these values with your own
verl_workdir=/path/to/verl
train_files=/path/to/gsm8k/train.parquet
val_files=/path/to/gsm8k/test.parquet
apptainer_image_path=/path/to/verl-ngc.sif
# replace these values with your own
# Getting the node names
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
head_node_ip=${ADDR[1]}
else
head_node_ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi
port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"
# make sure we set environment variables before Ray initialization
printenv
echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
apptainer run --nv --bind $verl_workdir $apptainer_image_path \
ray start --head --node-ip-address="$head_node_ip" --port=$port \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10
# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
echo "Starting WORKER $i at $node_i"
srun --nodes=1 --ntasks=1 -w "$node_i" \
apptainer run --nv --bind $verl_workdir $apptainer_image_path \
ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
sleep 5
done
PYTHONUNBUFFERED=1 srun --overlap --nodes=1 --ntasks=1 -w "$head_node" \
apptainer run --nv --bind $verl_workdir $apptainer_image_path \
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gae \
data.train_files=$train_files \
data.val_files=$val_files \
data.train_batch_size=256 \
data.max_prompt_length=512 \
data.max_response_length=256 \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
critic.optim.lr=1e-5 \
critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.use_kl_in_reward=False \
trainer.logger=console \
trainer.val_before_train=False \
trainer.n_gpus_per_node="${SLURM_GPUS_PER_NODE}" \
trainer.nnodes="${SLURM_NNODES}" \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.total_epochs=15 2>&1 | tee verl_demo_slurm.log
# Split Placement Example
Here we introduce how to run a naive implementation of split placement for the PPO algorithm.
We will release the complete version of flexible placement in the near future.
For a quick start, you only need to follow Step 2 to modify the code and then Step 4 to run the split placement example.
### Step 1: Placing the models on different GPUs
Specify the placement and resource allocation. In this example, we place the actor and reference policy on the first half of the GPUs, while mapping the critic and reward model (if any) to the second half of the GPUs.
```python
actor_rollout_ref_pool_id = 'actor_rollout_ref_pool'
critic_pool_id = 'critic_pool'
if config.trainer.nnodes // 2 == 0 and config.trainer.n_gpus_per_node // 2 > 0:
resource_pool_spec = {
actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes,
critic_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes,
}
else:
resource_pool_spec = {
actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2),
critic_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2),
}
print(f'resource_pool_spec: {resource_pool_spec}')
mapping = {
Role.ActorRollout: actor_rollout_ref_pool_id,
Role.Critic: critic_pool_id,
Role.RefPolicy: actor_rollout_ref_pool_id,
}
mapping[Role.RewardModel] = critic_pool_id
```
### Step 2: Make the models execute asynchronously
Based on the model placement, we need to make the models execute asynchronously.
To do so, turn off the `blocking` flag (i.e., set `blocking=False`) in the decorator of the relevant model operations.
For example, if we want the actor update and critic update to run in parallel, we need to make the following modification in `fsdp_workers.py`:
```python
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
def update_actor(self, data: DataProto):
...
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
def update_critic(self, data: DataProto):
...
```
We can also parallelize the computation of `ref_log_prob`, `values`, and `rewards` under split placement; see the sketch below. For simplicity of the tutorial, we don't do this in this example.
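For reference, a minimal sketch of what this could look like inside the split-placement `fit` loop, assuming `compute_ref_log_prob` and `compute_values` were also registered with `blocking=False` (they are not in this example):
```python
# Hypothetical extension of the fit loop (not part of this example):
# issue the non-blocking RPCs first, so both worker groups run concurrently ...
ref_log_prob_future = self.ref_policy_wg.compute_ref_log_prob(batch)
values_future = self.critic_wg.compute_values(batch)

# ... then collect the futures on the single controller process and merge the outputs
batch = batch.union(ref_log_prob_future.get())
batch = batch.union(values_future.get())
```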
### Step 3: Execute these operations in parallel in the single controller process
To implement the parallel execution of the actor and critic update, the only thing we need to modify in `ray_trainer.py` is to `get` the concurrent futures on the single controller process.
```python
critic_output = critic_output.get()
actor_output = actor_output.get()
```
### Step 4: Run the split placement example
```bash
bash run_deepseek7b_llm.sh
```
# this ppo trainer split config will override the default ppo_trainer.yaml
hydra:
searchpath:
- file://../../verl/trainer/config
defaults:
- ppo_trainer
- _self_
data:
tokenizer: null
train_files: ~/data/rlhf/gsm8k/train.parquet
val_files: ~/data/rlhf/gsm8k/test.parquet
prompt_key: prompt
max_prompt_length: 512
max_response_length: 512
train_batch_size: 1024
  val_batch_size: null # DEPRECATED: validation datasets are sent to the inference engines as a whole batch; the engines schedule memory themselves
return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
return_raw_chat: False
return_full_prompt: False
shuffle: True
actor_rollout_ref:
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.0
use_kl_loss: False # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be overridden by the program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
    prompt_length: ${data.max_prompt_length} # not used for open-source models
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # could get higher throughput
# for hf rollout
do_sample: True
# number of responses (i.e. num sample times)
n: 1 # > 1 for grpo
critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be overridden by the program
model:
path: ~/models/deepseek-llm-7b-chat
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5
reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: null # set a number
max_length: null
ulysses_sequence_parallel_size: 1 # sp size
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
reward_manager: naive
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: gae
use_kl_in_reward: False
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001
trainer:
total_epochs: 30
total_training_steps: null
project_name: verl_examples
experiment_name: gsm8k
logger: [ 'console', 'wandb' ]
log_val_generations: 0
nnodes: 1
n_gpus_per_node: 8
save_freq: -1
  # auto: find the last checkpoint to resume from; if none is found, start from scratch
  resume_mode: auto # or disable, or resume_path if resume_from_path is set
resume_from_path: null
test_freq: -1
critic_warmup: 0
default_hdfs_dir: null
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
ray_init:
  num_cpus: null # `null` means using all CPUs, which may cause the job to hang if CPUs are limited by systems like SLURM; in that case, set this to an allowed number.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Note that we don't combine this main with ray_trainer, since ray_trainer is also used by other main entry points.
"""
import hydra
import ray
import torch
from split_monkey_patch import fit
from verl import DataProto
from verl.trainer.ppo.ray_trainer import RayPPOTrainer
from verl.utils.reward_score import gsm8k, math
def _select_rm_score_fn(data_source):
if data_source == "openai/gsm8k":
return gsm8k.compute_score
elif data_source == "lighteval/MATH":
return math.compute_score
else:
raise NotImplementedError
class RewardManager:
def __init__(self, tokenizer, num_examine) -> None:
self.tokenizer = tokenizer
self.num_examine = num_examine # the number of batches of decoded responses to print to the console
def __call__(self, data: DataProto, return_dict: bool = False):
"""We will expand this function gradually based on the available datasets"""
# If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn
if "rm_scores" in data.batch.keys():
return data.batch["rm_scores"]
reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32)
already_print_data_sources = {}
for i in range(len(data)):
data_item = data[i] # DataProtoItem
prompt_ids = data_item.batch["prompts"]
prompt_length = prompt_ids.shape[-1]
valid_prompt_length = data_item.batch["attention_mask"][:prompt_length].sum()
valid_prompt_ids = prompt_ids[-valid_prompt_length:]
response_ids = data_item.batch["responses"]
valid_response_length = data_item.batch["attention_mask"][prompt_length:].sum()
valid_response_ids = response_ids[:valid_response_length]
# decode
sequences = torch.cat((valid_prompt_ids, valid_response_ids))
sequences_str = self.tokenizer.decode(sequences)
ground_truth = data_item.non_tensor_batch["reward_model"]["ground_truth"]
# select rm_score
data_source = data_item.non_tensor_batch["data_source"]
compute_score_fn = _select_rm_score_fn(data_source)
score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth)
reward_tensor[i, valid_response_length - 1] = score
if data_source not in already_print_data_sources:
already_print_data_sources[data_source] = 0
if already_print_data_sources[data_source] < self.num_examine:
already_print_data_sources[data_source] += 1
print(sequences_str)
if return_dict:
return {"reward_tensor": reward_tensor}
else:
return reward_tensor
@hydra.main(config_path="config", config_name="ppo_trainer_split", version_base=None)
def main(config):
if not ray.is_initialized():
# this is for local ray cluster
ray.init(
runtime_env={"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}},
num_cpus=config.ray_init.num_cpus,
)
ray.get(main_task.remote(config))
@ray.remote
def main_task(config):
# print initial config
from pprint import pprint
from omegaconf import OmegaConf
from verl.utils.fs import copy_to_local
pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
OmegaConf.resolve(config)
# download the checkpoint from hdfs
local_path = copy_to_local(config.actor_rollout_ref.model.path)
# instantiate tokenizer
from verl.utils import hf_tokenizer
tokenizer = hf_tokenizer(local_path)
# define worker classes
if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
assert config.critic.strategy in {"fsdp", "fsdp2"}
from verl.single_controller.ray import RayWorkerGroup
from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
ray_worker_group_cls = RayWorkerGroup
elif config.actor_rollout_ref.actor.strategy == "megatron":
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
ray_worker_group_cls = NVMegatronRayWorkerGroup
else:
raise NotImplementedError
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
role_worker_mapping = {
Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
Role.Critic: ray.remote(CriticWorker),
}
    # NOTE: initialize two resource pools
actor_rollout_ref_pool_id = "actor_rollout_ref_pool"
critic_pool_id = "critic_pool"
if config.trainer.nnodes // 2 == 0 and config.trainer.n_gpus_per_node // 2 > 0:
resource_pool_spec = {
actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes,
critic_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes,
}
else:
resource_pool_spec = {
actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2),
critic_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2),
}
print(f"resource_pool_spec: {resource_pool_spec}")
mapping = {
Role.ActorRollout: actor_rollout_ref_pool_id,
Role.Critic: critic_pool_id,
}
# use reference model
if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = actor_rollout_ref_pool_id
# we should adopt a multi-source reward function here
# - for rule-based rm, we directly call a reward score
# - for model-based rm, we call a model
# - for code related prompt, we send to a sandbox if there are test cases
# - finally, we combine all the rewards together
# - The reward type depends on the tag of the data
if config.reward_model.enable:
if config.reward_model.strategy in {"fsdp", "fsdp2"}:
from verl.workers.fsdp_workers import RewardModelWorker
elif config.reward_model.strategy == "megatron":
from verl.workers.megatron_workers import RewardModelWorker
else:
raise NotImplementedError
role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
mapping[Role.RewardModel] = critic_pool_id
reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0)
# Note that we always use function-based RM for validation
val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
RayPPOTrainer.fit = fit
trainer = RayPPOTrainer(
config=config,
tokenizer=tokenizer,
role_worker_mapping=role_worker_mapping,
resource_pool_manager=resource_pool_manager,
ray_worker_group_cls=ray_worker_group_cls,
reward_fn=reward_fn,
val_reward_fn=val_reward_fn,
)
trainer.init_workers()
trainer.fit()
if __name__ == "__main__":
main()
set -x
python3 main_ppo_split.py \
algorithm.adv_estimator=gae \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
critic.optim.lr=1e-5 \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=8 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_example_gsm8k' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_epochs=15 $@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
An naive implementation of split placment example
"""
import uuid
from copy import deepcopy
from pprint import pprint
import numpy as np
import torch
from verl import DataProto
from verl.trainer.ppo.ray_trainer import (
AdvantageEstimator,
apply_kl_penalty,
compute_advantage,
compute_data_metrics,
compute_timing_metrics,
marked_timer,
)
from verl.utils.metric import reduce_metrics
def fit(self):
"""
The training loop of PPO.
The driver process only need to call the compute functions of the worker group through RPC
to construct the PPO dataflow.
The light-weight advantage computation is done on the driver process.
"""
from omegaconf import OmegaConf
from verl.utils.tracking import Tracking
logger = Tracking(
project_name=self.config.trainer.project_name,
experiment_name=self.config.trainer.experiment_name,
default_backend=self.config.trainer.logger,
config=OmegaConf.to_container(self.config, resolve=True),
)
self.global_steps = 0
# load checkpoint before doing anything
self._load_checkpoint()
# perform validation before training
# currently, we only support validation using the reward_function.
if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
val_metrics = self._validate()
pprint(f"Initial validation metrics: {val_metrics}")
logger.log(data=val_metrics, step=self.global_steps)
if self.config.trainer.get("val_only", False):
return
# we start from step 1
self.global_steps += 1
last_val_metrics = None
for epoch in range(self.config.trainer.total_epochs):
for batch_dict in self.train_dataloader:
metrics = {}
timing_raw = {}
batch: DataProto = DataProto.from_single_dict(batch_dict)
# pop those keys for generation
gen_batch = batch.pop(batch_keys=["input_ids", "attention_mask", "position_ids"])
is_last_step = self.global_steps >= self.total_training_steps
with marked_timer("step", timing_raw):
# generate a batch
with marked_timer("gen", timing_raw):
gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
timing_raw.update(gen_batch_output.meta_info["timing"])
gen_batch_output.meta_info.pop("timing", None)
if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
with marked_timer("gen_max", timing_raw):
gen_baseline_batch = deepcopy(gen_batch)
gen_baseline_batch.meta_info["do_sample"] = False
gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch)
batch = batch.union(gen_baseline_output)
reward_baseline_tensor = self.reward_fn(batch)
reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)
batch.pop(batch_keys=list(gen_baseline_output.batch.keys()))
batch.batch["reward_baselines"] = reward_baseline_tensor
del gen_baseline_batch, gen_baseline_output
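# For REMAX, the greedy (do_sample=False) rollout above is scored and stored as
# per-prompt reward baselines, which the advantage computation later subtracts
# to reduce variance.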
batch.non_tensor_batch["uid"] = np.array(
[str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
)
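# Each prompt gets a unique uid so that, after repeat(), grouped advantage
# estimators such as GRPO can tell which responses came from the same prompt.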
# repeat to align with repeated responses in rollout
batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
batch = batch.union(gen_batch_output)
# Balance the number of valid tokens across DP ranks.
# NOTE: This usually changes the order of data in the `batch`,
# which won't affect the advantage calculation (since it's based on uid),
# but might affect the loss calculation (due to the change of mini-batching).
# TODO: Decouple the DP balancing and mini-batching.
self._balance_batch(batch, metrics=metrics)
# compute global_valid tokens
batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
# recompute old_log_probs
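# (the rollout engine's log-probs can differ slightly from the training policy's,
# so they are recomputed with the actor before building the PPO ratio)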
with marked_timer("old_log_prob", timing_raw):
old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
batch = batch.union(old_log_prob)
if self.use_reference_policy:
# compute reference log_prob
with marked_timer("ref", timing_raw):
ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
batch = batch.union(ref_log_prob)
# compute values
if self.use_critic:
with marked_timer("values", timing_raw):
values = self.critic_wg.compute_values(batch)
batch = batch.union(values)
with marked_timer("adv", timing_raw):
# compute scores. Support both model and function-based.
# We first compute the scores using reward model. Then, we call reward_fn to combine
# the results from reward model and rule-based results.
if self.use_rm:
# we first compute reward model score
reward_tensor = self.rm_wg.compute_rm_score(batch)
batch = batch.union(reward_tensor)
# we combine with rule-based rm
reward_tensor = self.reward_fn(batch)
batch.batch["token_level_scores"] = reward_tensor
# compute rewards. apply_kl_penalty if available
if self.config.algorithm.use_kl_in_reward:
batch, kl_metrics = apply_kl_penalty(
batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
)
metrics.update(kl_metrics)
else:
batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
# compute advantages, executed on the driver process
norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
batch = compute_advantage(
batch,
adv_estimator=self.config.algorithm.adv_estimator,
gamma=self.config.algorithm.gamma,
lam=self.config.algorithm.lam,
num_repeat=self.config.actor_rollout_ref.rollout.n,
norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
config=self.config.algorithm,
)
# implement critic warmup
if self.config.trainer.critic_warmup <= self.global_steps:
# update actor
with marked_timer("update_actor_call", timing_raw):
actor_output = self.actor_rollout_wg.update_actor(batch)
else:
actor_output = None
# update critic
if self.use_critic:
with marked_timer("update_critic_call", timing_raw):
critic_output = self.critic_wg.update_critic(batch)
# NOTE: make sure blocking=False is set in update_actor and update_critic in the worker class
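# Because both update calls return futures, the actor update dispatched above and the
# critic update dispatched here run concurrently on their separate resource pools;
# .get() below blocks until each finishes.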
with marked_timer("update_actor_critic", timing_raw):
critic_output = critic_output.get()
critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
metrics.update(critic_output_metrics)
if actor_output is not None:
actor_output = actor_output.get()
actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
metrics.update(actor_output_metrics)
# validate
if (
self.val_reward_fn is not None
and self.config.trainer.test_freq > 0
and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
):
with marked_timer("testing", timing_raw):
val_metrics: dict = self._validate()
if is_last_step:
last_val_metrics = val_metrics
metrics.update(val_metrics)
if self.config.trainer.save_freq > 0 and (
is_last_step or self.global_steps % self.config.trainer.save_freq == 0
):
with marked_timer("save_checkpoint", timing_raw):
self._save_checkpoint()
# collect metrics
metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
# TODO: make a canonical logger that supports various backends
logger.log(data=metrics, step=self.global_steps)
if self.global_steps >= self.total_training_steps:
pprint(f"Final validation metrics: {last_val_metrics}")
return
self.global_steps += 1
# -*- coding: utf-8 -*-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NOW=$(date +%Y%m%d)
export WANDB_DIR=gsm8k-grpo-lora-qwen2.5-0.5b-${NOW}
export WANDB_PROJECT=${WANDB_DIR}
export WANDB_EXP=0.5b-${NOW}
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
set -x
nproc_per_gpu=116
nnodes=1
ngpu_per_node=1
total_procs=$(( nproc_per_gpu * nnodes * ngpu_per_node ))
mini_batch_size=$(( total_procs ))
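# With nproc_per_gpu=116 on a single GPU, total_procs = 116, so the whole rollout
# batch is consumed as a single PPO mini-batch.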
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=${total_procs} \
data.val_batch_size=${total_procs} \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.shuffle=False \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.ppo_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.1 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=512 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
actor_rollout_ref.actor.entropy_coeff=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name=${WANDB_PROJECT} \
trainer.experiment_name=${WANDB_EXP} \
trainer.n_gpus_per_node=1 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@ 2>&1 | tee ${WANDB_PROJECT}.log
# -*- coding: utf-8 -*-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NOW=$(date +%Y%m%d)
export WANDB_DIR=gsm8k-grpo-lora-qwen2.5-1.5b-${NOW}
export WANDB_PROJECT=${WANDB_DIR}
export WANDB_EXP=1.5b-${NOW}
MODEL_PATH=Qwen/Qwen2.5-1.5B-Instruct
set -x
nproc_per_gpu=128
nnodes=1
ngpu_per_node=1
total_procs=$(( nproc_per_gpu * nnodes * ngpu_per_node ))
mini_batch_size=$(( total_procs ))
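# Same batching scheme as the 0.5B script: total_procs = 128 here, and the whole
# rollout batch is used as a single PPO mini-batch.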
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=${total_procs} \
data.val_batch_size=${total_procs} \
data.max_prompt_length=512 \
data.max_response_length=1024 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.shuffle=False \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.ppo_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.1 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=512 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.log_prob_micro_batch_size=${mini_batch_size} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
actor_rollout_ref.actor.entropy_coeff=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name=${WANDB_PROJECT} \
trainer.experiment_name=${WANDB_EXP} \
trainer.n_gpus_per_node=1 \
trainer.nnodes=1 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=1 $@ 2>&1 | tee ${WANDB_PROJECT}.log