Commit f87b35b2 authored by jerrrrry

Initial commit
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}
MAX_PROMPT_LEN=${MAX_PROMPT_LEN:-512}
MAX_RESPONSE_LEN=${MAX_RESPONSE_LEN:-512}
ENGINE=${ENGINE:-vllm}
RM_PAD=${RM_PAD:-True}
ADV_ESTIMATOR=${ADV_ESTIMATOR:-gae}
USE_KL=${USE_KL:-False}
CUSTOM_REWARD_FN=${CUSTOM_REWARD_FN:-False}
ENABLE_CHUNKED_PREFILL=${ENABLE_CHUNKED_PREFILL:-True} # For vLLM VLM placeholder issue: https://github.com/vllm-project/vllm/issues/15185
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
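# Worked example with the defaults above (NUM_GPUS=8, b=2, g=4), assuming the
# prompt batch sizes are derived by dividing trajectories by g as in the comments:
#   train_traj_micro_bsz  = 2 * 8  = 16 trajectories per forward pass
#   train_traj_mini_bsz   = 16 * 2 = 32 trajectories per mini-batch
#   train_prompt_mini_bsz = 32 / 4 = 8  prompts per mini-batch
#   train_prompt_bsz      = 8 * 2  = 16 prompts per training batch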
reward_fn_name=null
reward_fn_file_path=null
output_file="$(pwd)/output.txt"
if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
reward_fn_name="my_reward_function"
reward_fn_file_path="$(pwd)/my_reward_function.py"
rm -rf "${reward_fn_file_path}"
cat <<EOF > "$reward_fn_file_path"
def ${reward_fn_name}(data_source, solution_str, ground_truth, extra_info=None):
print(f"Congratulations!!! You have called ${reward_fn_name} successfully!!!")
return 0.1
EOF
rm -rf "${output_file}"
fi
exp_name="$(basename "${MODEL_ID,,}")-function-reward-minimal"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator="${ADV_ESTIMATOR}" \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size="${train_prompt_bsz}" \
data.max_prompt_length="${MAX_PROMPT_LEN}" \
data.max_response_length="${MAX_RESPONSE_LEN}" \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding="${RM_PAD}" \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.use_kl_loss="${USE_KL}" \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name="${ENGINE}" \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.enable_chunked_prefill="${ENABLE_CHUNKED_PREFILL}" \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding="${RM_PAD}" \
critic.model.path="${MODEL_PATH}" \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
custom_reward_function.path="${reward_fn_file_path}" \
custom_reward_function.name="${reward_fn_name}" \
algorithm.use_kl_in_reward="${USE_KL}" \
algorithm.kl_penalty=kl \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.nnodes=1 \
trainer.n_gpus_per_node="${NUM_GPUS}" \
trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
trainer.test_freq="${TEST_FREQ}" \
trainer.save_freq="${SAVE_FREQ}" \
trainer.resume_mode="${RESUME_MODE}" \
trainer.total_epochs=2 \
trainer.total_training_steps="${TOT_TRAIN_STEPS}" "$@" \
| tee "${output_file}"
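# The run output is teed to ${output_file}; when CUSTOM_REWARD_FN=True,
# check_custom_rwd_fn.py below inspects that file (presumably for the success
# message printed by the generated custom reward function).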
if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
python3 tests/e2e/check_custom_rwd_fn.py --output_file="${output_file}"
check_exit_code=$?
rm -rf "${reward_fn_file_path}"
rm -rf "${output_file}"
# Return the exit code of check_custom_rwd_fn.py if it fails
if [ $check_exit_code -ne 0 ]; then
exit $check_exit_code
fi
fi
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}
RM_PAD=${RM_PAD:-True}
SP_SIZE=${SP_SIZE:-1}
SEQ_BALANCE=${SEQ_BALANCE:-False}
LIGER=${LIGER:-False}
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
train_max_token_num_per_gpu=32768
infer_max_token_num_per_gpu=32768
exp_name="$(basename "${MODEL_ID,,}")-model-reward-minimal"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gae \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size=${train_prompt_bsz} \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.return_raw_chat=True \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.model.use_liger="${LIGER}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding="${RM_PAD}" \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.use_dynamic_bsz="${SEQ_BALANCE}" \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.ulysses_sequence_parallel_size="${SP_SIZE}" \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.optim.lr=1e-5 \
critic.ulysses_sequence_parallel_size="${SP_SIZE}" \
critic.model.use_remove_padding="${RM_PAD}" \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path="${MODEL_PATH}" \
critic.model.enable_gradient_checkpointing=False \
critic.use_dynamic_bsz="${SEQ_BALANCE}" \
critic.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \
critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.ulysses_sequence_parallel_size="${SP_SIZE}" \
reward_model.model.path="${MODEL_PATH}" \
reward_model.model.use_remove_padding="${RM_PAD}" \
reward_model.model.fsdp_config.param_offload=True \
reward_model.use_dynamic_bsz="${SEQ_BALANCE}" \
reward_model.forward_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.nnodes=1 \
trainer.n_gpus_per_node="${NUM_GPUS}" \
trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
trainer.test_freq="${TEST_FREQ}" \
trainer.save_freq="${SAVE_FREQ}" \
trainer.resume_mode="${RESUME_MODE}" \
trainer.total_epochs=2 \
trainer.total_training_steps="${TOT_TRAIN_STEPS}" "$@"
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
adv_estimator=grpo
kl_coef=0.0
use_kl_in_reward=False
use_kl_loss=False
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
max_prompt_length=1024
max_response_length=2048
enable_overlong_buffer=True
overlong_buffer_len=128
overlong_penalty_factor=1.0
loss_agg_mode="token-mean"
enable_filter_groups=True
filter_groups_metric=seq_reward
max_num_gen_batches=10
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
gen_prompt_bsz=$((train_prompt_bsz * 4))
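# Worked example with the defaults above (NUM_GPUS=8, b=2, g=4): 16 training
# prompts per step and gen_prompt_bsz = 64, i.e. dynamic sampling may generate
# for up to 4x as many prompts as are finally trained on after group filtering.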
exp_name="$(basename "${MODEL_ID,,}")-dapo-minimal"
python3 -m recipe.dapo.src.main_dapo \
data.train_files="${HOME}/data/gsm8k/train.parquet" \
data.val_files="${HOME}/data/gsm8k/test.parquet" \
reward_model.reward_manager=dapo \
algorithm.adv_estimator=${adv_estimator} \
algorithm.use_kl_in_reward=${use_kl_in_reward} \
algorithm.kl_ctrl.kl_coef=${kl_coef} \
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
data.max_prompt_length=${max_prompt_length} \
data.max_response_length=${max_response_length} \
reward_model.overlong_buffer.enable=${enable_overlong_buffer} \
reward_model.overlong_buffer.len=${overlong_buffer_len} \
reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
data.train_batch_size=${train_prompt_bsz} \
data.gen_batch_size=${gen_prompt_bsz} \
algorithm.filter_groups.enable=${enable_filter_groups} \
algorithm.filter_groups.metric=${filter_groups_metric} \
algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.n_gpus_per_node=${NUM_GPUS} \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_epochs=2 \
trainer.resume_mode=disable \
trainer.val_before_train=False \
trainer.total_training_steps=1 "$@"
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-${HOME}/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-${HOME}/data/gsm8k/test.parquet}
ADV_ESTIMATOR=${ADV_ESTIMATOR:-gae}
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
exp_name="$(basename "${MODEL_ID,,}")-megatron-gsm8k-minimal"
python3 -m verl.trainer.main_ppo --config-path=config \
--config-name='ppo_megatron_trainer.yaml' \
algorithm.adv_estimator="${ADV_ESTIMATOR}" \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size=${train_prompt_bsz} \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.context_parallel_size=2 \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.checkpoint.contents=['model','hf_model','optimizer','extra'] \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.context_parallel_size=2 \
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.optim.lr=2e-5 \
critic.model.path="${MODEL_PATH}" \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.megatron.pipeline_model_parallel_size=2 \
critic.megatron.virtual_pipeline_model_parallel_size=2 \
critic.megatron.context_parallel_size=2 \
critic.megatron.tensor_model_parallel_size=2 \
critic.checkpoint.contents=['model','hf_model','optimizer','extra'] \
algorithm.use_kl_in_reward=True \
algorithm.kl_penalty=kl \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.nnodes=1 \
trainer.n_gpus_per_node=${NUM_GPUS} \
trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
trainer.test_freq="${TEST_FREQ}" \
trainer.save_freq="${SAVE_FREQ}" \
trainer.resume_mode="${RESUME_MODE}" \
trainer.total_epochs=2 \
trainer.total_training_steps="${TOT_TRAIN_STEPS}" "$@"
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-${HOME}/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-${HOME}/data/gsm8k/test.parquet}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
exp_name="$(basename "${MODEL_ID,,}")-prime-minimal"
python3 -m recipe.prime.main_prime \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size=${train_prompt_bsz} \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.filter_accuracy=True \
data.accuracy_lower_bound=0.2 \
data.accuracy_upper_bound=0.8 \
data.oversample_factor=4 \
data.return_raw_chat=True \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=5e-7 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.adv_estimator=rloo \
algorithm.use_kl_in_reward=True \
algorithm.kl_penalty=kl \
algorithm.kl_ctrl.kl_coef=0.001 \
reward_model.model.path="${MODEL_PATH}" \
reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
reward_model.model.update=before \
reward_model.model.beta_train=0.05 \
reward_model.model.optim.lr=1e-6 \
reward_model.model.optim.grad_clip=10.0 \
reward_model.model.input_tokenizer=null \
reward_model.mini_batch_size=${train_prompt_bsz} \
reward_model.reward_manager=prime \
trainer.val_before_train=False \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.n_gpus_per_node=${NUM_GPUS} \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_training_steps=1 "$@"
#!/usr/bin/env bash
set -xeuo pipefail
huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--local-dir $HOME/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
python3 -m verl.trainer.main_generation \
trainer.nnodes=1 \
trainer.n_gpus_per_node=8 \
data.path=$HOME/data/r1/test.parquet \
data.prompt_key=prompt \
data.batch_size=1024 \
data.n_samples=1 \
data.output_path=$HOME/data/r1/test-output-k1.parquet \
model.path=$HOME/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
rollout.temperature=0.6 \
rollout.top_p=0.95 \
rollout.prompt_length=1024 \
rollout.response_length=32768 \
rollout.tensor_model_parallel_size=1 \
rollout.gpu_memory_utilization=0.95 \
rollout.max_num_batched_tokens=65536 \
rollout.enforce_eager=False \
rollout.free_cache_engine=False
python3 -m recipe.r1.main_eval \
data.path=$HOME/data/r1/test-output-k1.parquet \
data.prompt_key=prompt \
data.response_key=responses \
custom_reward_function.path=recipe/r1/reward_score.py \
custom_reward_function.name=reward_func
#!/usr/bin/env bash
set -e -x
OUTPUT_FILE="/tmp/output_ray_trainer.txt"
export PATH=$PATH:~/.local/bin
rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
data.train_batch_size=800 \
data.max_prompt_length=16 \
data.max_response_length=32 \
data.return_raw_input_ids=True \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=200 \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.optim.lr=1e-4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
actor_rollout_ref.rollout.name=hf \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
critic.ppo_micro_batch_size_per_gpu=200 \
critic.model.path=tests/e2e/arithmetic_sequence/model \
critic.optim.lr=1e-3 \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=200 \
trainer.experiment_name=arithmetic_sequences \
trainer.logger=['console'] \
trainer.n_gpus_per_node=1 \
trainer.test_freq=1 \
trainer.save_freq=110 | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE
rm -rf $OUTPUT_FILE
#!/usr/bin/env bash
set -e -x
OUTPUT_FILE="/tmp/output_ray_trainer.txt"
export PATH=$PATH:~/.local/bin
rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
data.train_batch_size=800 \
data.val_batch_size=200 \
data.max_prompt_length=16 \
data.max_response_length=32 \
data.return_raw_input_ids=True \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=200 \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.optim.lr=1e-4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
actor_rollout_ref.rollout.name=hf \
actor_rollout_ref.rollout.use_fire_sampling=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
critic.ppo_micro_batch_size_per_gpu=200 \
critic.model.path=tests/e2e/arithmetic_sequence/model \
critic.optim.lr=1e-3 \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=200 \
trainer.experiment_name=arithmetic_sequences \
trainer.logger=['console'] \
trainer.n_gpus_per_node=1 \
trainer.test_freq=1 \
trainer.save_freq=110 | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE --target 0.19
rm -rf $OUTPUT_FILE
#!/usr/bin/env bash
set -e -x
huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir $HOME/models/Qwen/Qwen2.5-0.5B
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.model.tokenizer_path=tests/e2e/arithmetic_sequence/model \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.use_remove_padding=True \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=1
#!/bin/bash
set -xeuo pipefail
# Get the configuration name and engine name from arguments
CONFIG_NAME="$1"
ENGINE="${2:-vllm}"
# Download model if needed
huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir "$HOME/models/Qwen/Qwen2.5-0.5B"
# Run the training with the specified configuration
python3 -m verl.trainer.main_ppo \
--config-name "$CONFIG_NAME" "$@"
#!/usr/bin/env bash
set -xeuo pipefail
ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.fsdp_sft_trainer"}
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}
SP_SIZE=${SP_SIZE:-1}
LIGER=${LIGER:-False}
MULTITURN=${MULTITURN:-False}
LORA_RANK=${LORA_RANK:-0}
RM_PAD=${RM_PAD:-True}
micro_bsz=2
project_name="verl-test"
exp_name="$(basename "${MODEL_ID,,}")-sft-minimal"
ckpts_home=${ckpts_home:-$HOME/${project_name}/${exp_name}}
mkdir -p "${ckpts_home}"
torchrun --standalone --nnodes=1 --nproc_per_node=${NUM_GPUS} ${ENTRYPOINT} \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.prompt_key=extra_info \
data.response_key=extra_info \
data.prompt_dict_keys=['question'] \
data.response_dict_keys=['answer'] \
data.multiturn.enable="${MULTITURN}" \
data.multiturn.messages_key=messages \
optim.lr=1e-4 \
data.micro_batch_size_per_gpu=${micro_bsz} \
model.partial_pretrain="${MODEL_PATH}" \
model.lora_rank="${LORA_RANK}" \
model.lora_alpha=16 \
model.target_modules=all-linear \
model.use_liger="${LIGER}" \
ulysses_sequence_parallel_size="${SP_SIZE}" \
use_remove_padding="${RM_PAD}" \
trainer.default_local_dir="${ckpts_home}" \
trainer.project_name="${project_name}" \
trainer.experiment_name="${exp_name}" \
trainer.total_training_steps=1 \
trainer.logger=['console'] \
trainer.default_hdfs_dir=null "$@"
rm -rf "${ckpts_home:?}/"*
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed
from tensordict import TensorDict
from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer
from torch.distributed.device_mesh import init_device_mesh
from verl.utils.distributed import initialize_global_process_group
def test_trainer_forward_consistency(trainer: FSDPSFTTrainer, total_steps: int = 4):
"""Test consistency between original forward pass and SP+rmpad forward passes.
Args:
trainer: The FSDPSFTTrainer instance to test
total_steps: Number of steps to test (default: 4)
"""
if trainer.device_mesh.get_rank() == 0:
print("\nStarting debug comparison between original and SP+rmpad forward passes...")
print(f"Sequence parallel size: {trainer.config.ulysses_sequence_parallel_size}")
print(f"Remove padding: {trainer.use_remove_padding}\n")
steps_remaining = total_steps
for epoch in range(1): # Just one epoch for testing
trainer.train_sampler.set_epoch(epoch=epoch)
for data in trainer.train_dataloader:
data = TensorDict(data, batch_size=trainer.config.data.train_batch_size).cuda()
trainer.fsdp_model.train()
micro_batches = data.split(trainer.config.data.micro_batch_size_per_gpu)
for idx, micro_batch in enumerate(micro_batches):
if trainer.device_mesh.get_rank() == 0:
print(f"\nProcessing micro batch {idx + 1}/{len(micro_batches)}")
# Compute losses using both methods
# Disable SP and rmpad
trainer.use_remove_padding = False
old_sp = trainer.config.ulysses_sequence_parallel_size
trainer.config.ulysses_sequence_parallel_size = 1
loss_ref = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)
# Do SP and rmpad
trainer.config.ulysses_sequence_parallel_size = old_sp
trainer.use_remove_padding = True
loss_sp = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)
# Collect losses across all ranks
loss_ref_all = loss_ref.clone()
loss_sp_all = loss_sp.clone()
torch.distributed.all_reduce(loss_ref_all, op=torch.distributed.ReduceOp.AVG)
torch.distributed.all_reduce(loss_sp_all, op=torch.distributed.ReduceOp.AVG)
# Calculate relative difference of averaged losses
rel_diff = torch.abs(loss_ref_all - loss_sp_all) / (torch.abs(loss_ref_all) + 1e-8)
if trainer.device_mesh.get_rank() == 0:
print("\nComparison Results (Averaged across ranks):")
print(f"Reference Loss: {loss_ref_all.item():.6f}")
print(f"SP+rmpad Loss: {loss_sp_all.item():.6f}")
print(f"Relative Difference: {rel_diff.item():.6f}")
assert rel_diff.item() < 1e-2, "Significant difference detected between averaged losses!"
print("Loss difference is within the acceptable range.")
steps_remaining -= 1
if steps_remaining == 0:
break
if steps_remaining == 0:
break
break
if trainer.device_mesh.get_rank() == 0:
print("\nDebug comparison completed successfully.")
def create_trainer(config):
"""Create and initialize a trainer instance with the given config.
Args:
config: Configuration object with training parameters
Returns:
FSDPSFTTrainer: Initialized trainer instance
"""
local_rank, rank, world_size = initialize_global_process_group()
device_mesh = init_device_mesh(device_type='cuda', mesh_shape=(world_size,), mesh_dim_names=('fsdp',))
dp_size = world_size // config.ulysses_sequence_parallel_size
ulysses_device_mesh = init_device_mesh(device_type='cuda',
mesh_shape=(dp_size, config.ulysses_sequence_parallel_size),
mesh_dim_names=('dp', 'sp'))
return FSDPSFTTrainer(config=config, device_mesh=device_mesh, ulysses_device_mesh=ulysses_device_mesh)
def main(config):
"""Main function to run trainer tests.
Args:
config: Configuration object with training parameters
"""
trainer = create_trainer(config)
test_trainer_forward_consistency(trainer)
if __name__ == '__main__':
import hydra
from omegaconf import DictConfig
@hydra.main(config_path="../../../verl/trainer/config", config_name="sft_trainer")
def hydra_entry(cfg: DictConfig) -> None:
main(cfg)
hydra_entry()
#!/usr/bin/env bash
# Tested with 1 & 4 GPUs
set -xeuo pipefail
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
NGPUS_PER_NODE=${NGPUS_PER_NODE:-4}
OUTPUT_PATH=${OUTPUT_PATH:-$HOME/data/gen/qwen_05_gen_test.parquet}
GEN_TP=${GEN_TP:-2} # Default tensor parallel size to 2
python3 -m verl.trainer.main_generation \
trainer.nnodes=1 \
trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
data.path="${HOME}/data/gsm8k/test.parquet" \
data.prompt_key=prompt \
data.n_samples=1 \
data.output_path="${OUTPUT_PATH}" \
model.path="${MODEL_ID}" \
+model.trust_remote_code=True \
rollout.temperature=1.0 \
rollout.top_k=50 \
rollout.top_p=0.7 \
rollout.prompt_length=2048 \
rollout.response_length=1024 \
rollout.tensor_model_parallel_size="${GEN_TP}" \
rollout.gpu_memory_utilization=0.8
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test memory buffers
- We start with two models with the same weights
- We use Memory buffer to make one of the models and then compare the parameters
"""
import torch
import gc
from transformers import LlamaModel, LlamaConfig
from verl.utils.memory_buffer import MemoryBufferModuleWrapper
def test_memory_buffers():
llama_config = LlamaConfig(vocab_size=256,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=2,
num_attention_heads=16,
num_key_value_heads=16)
model = LlamaModel(config=llama_config).cuda()
model_copy = LlamaModel(config=llama_config).cuda()
model_copy.load_state_dict(model.state_dict())
model_named_params = dict(model.named_parameters())
model_copy_named_params = dict(model_copy.named_parameters())
norm_factor = 1024**3
t_before = torch.cuda.get_device_properties(0).total_memory / norm_factor
r_before = torch.cuda.memory_reserved(0) / norm_factor
a_before = torch.cuda.memory_allocated(0) / norm_factor
print(f'Before Total memory: {t_before} GB, reserved: {r_before} GB, allocated: {a_before} GB')
model_wrapper = MemoryBufferModuleWrapper(model)
t = torch.cuda.get_device_properties(0).total_memory / norm_factor
r = torch.cuda.memory_reserved(0) / norm_factor
a = torch.cuda.memory_allocated(0) / norm_factor
gc.collect()
torch.cuda.empty_cache()
print(f'After Total memory: {t} GB, reserved: {r} GB, allocated: {a} GB')
change_ratio = (a - a_before) / a_before
assert change_ratio < 0.01, f'allocated memory change should be less than 1%, got {change_ratio}'
for (name1, param1), (name2, param2) in zip(model.named_parameters(), model_copy.named_parameters()):
assert name1 == name2
assert torch.eq(param1.data, param2.data).all(), f'{param1.data}, {param2.data}, {name1}'
if __name__ == '__main__':
test_memory_buffers()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def test_flash_attn_cross_entropy():
from verl.utils.torch_functional import logprobs_from_logits_naive
from verl.utils.debug import log_gpu_memory_usage
from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
import torch
from torch import nn
log_gpu_memory_usage('At start')
hidden_states = torch.randn(size=(2048, 5120), device='cuda', requires_grad=True, dtype=torch.bfloat16)
linear = nn.Linear(in_features=5120, out_features=155136, bias=False, device='cuda', dtype=torch.bfloat16)
logits = linear(hidden_states)
# logits = logits.float()
labels = torch.randint(low=0, high=155136, size=(2048,), device='cuda')
log_gpu_memory_usage('before computation')
# output = checkpoint.checkpoint(logprobs_from_logits, logits, labels, use_reentrant=True)
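# The per-token log-probability of the label is exactly the negative of the
# per-token cross-entropy loss, so negating the loss recovers log p(label).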
output = -cross_entropy_loss(logits, labels)[0]
# output = logprobs_from_logits(logits, labels)
log_gpu_memory_usage('After forward')
output.sum().backward()
log_gpu_memory_usage('After backward')
groundtruth = logprobs_from_logits_naive(logits.float(), labels)
torch.testing.assert_close(output, groundtruth)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from verl.utils.model import create_random_mask
from flash_attn.bert_padding import unpad_input
import torch
import pytest
def test_log_probs_from_logits_response_rmpad():
from verl.utils.torch_functional import log_probs_from_logits_response, log_probs_from_logits_response_rmpad
vocab_size = 32000
batch_size = 2
prompt_length = 256
response_length = 256
input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, prompt_length + response_length), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.2,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.6)
response_mask = attention_mask[:, -response_length:]
assert torch.all(response_mask[:, 0] == 1)
logits = torch.randn(batch_size, prompt_length + response_length, vocab_size, device='cuda')
logits_rmpad = unpad_input(logits, attention_mask)[0]
expected_output = log_probs_from_logits_response(input_ids=input_ids,
logits=logits,
response_length=response_length)
actual_output = log_probs_from_logits_response_rmpad(input_ids=input_ids,
attention_mask=attention_mask,
logits_rmpad=logits_rmpad,
response_length=response_length)
# This should be bitwise identical, since this operation only contains gather operators
assert torch.all(torch.eq(actual_output * response_mask, expected_output * response_mask))
@pytest.mark.parametrize("dtype", [torch.float64, torch.float32, torch.float16, torch.bfloat16])
def test_logprobs_from_logits_v2(dtype):
from verl.utils.torch_functional import logprobs_from_logits_v2, logprobs_from_logits_naive
vocab_size = 32000
batch_size = 2
seq_len = 512
labels = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len), device='cuda')
logits = torch.randn(batch_size, seq_len, vocab_size, device='cuda', dtype=dtype)
expected_output = logprobs_from_logits_naive(labels=labels, logits=logits)
actual_output = logprobs_from_logits_v2(labels=labels, logits=logits)
if dtype in [torch.float16, torch.bfloat16]: # float16 falls back to an exactly equivalent method
assert torch.equal(actual_output, expected_output)
else: # small numerical difference when using gather / logsumexp approach
torch.testing.assert_close(actual_output, expected_output, rtol=1e-5, atol=1e-5)
def test_lr_scheduler():
from torch import nn
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
from verl.utils.torch_functional import get_constant_schedule_with_warmup
constant_lr = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2)
lr_lst = []
for _ in range(5):
lr_lst.append(constant_lr.get_last_lr()[0])
constant_lr.step()
torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.001, 0.001])
from verl.utils.torch_functional import get_cosine_schedule_with_warmup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
cosine_lr = get_cosine_schedule_with_warmup(optimizer=optimizer,
num_warmup_steps=2,
num_training_steps=5,
min_lr_ratio=0.1)
lr_lst = []
for _ in range(5):
lr_lst.append(cosine_lr.get_last_lr()[0])
cosine_lr.step()
torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.0007750000000000002, 0.0003250000000000002])
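# Sanity check of the expected values above: with num_warmup_steps=2 the LR ramps
# linearly from 0 to 1e-3 over the warmup steps; a cosine decay of the form
# lr = 1e-3 * (0.1 + 0.9 * 0.5 * (1 + cos(pi * progress))) with progress = (step - 2) / 3
# reproduces 0.000775 and 0.000325 for the last two steps.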
#!/bin/bash
if [ "$#" -ne 1 ]; then
echo "Usage: $0 YOUR_GITHUB_TOKEN"
echo "Please provide exactly one input argument for your github token."
exit 1
fi
# Set your GitHub repository details
OWNER="volcengine"
REPO="verl"
TOKEN=$1
# API URL for workflow runs
API_URL="https://api.github.com/repos/$OWNER/$REPO/actions/runs?status=queued"
# Check required commands
command -v jq >/dev/null 2>&1 || { echo "jq is required but not installed. Aborting."; exit 1; }
# Get queued workflow runs
response=$(curl -s -H "Authorization: token $TOKEN" -H "Accept: application/vnd.github.v3+json" "$API_URL")
# Run this for debugging
# echo $response
# Extract run IDs
queued_run_ids=$(echo "$response" | jq -r '.workflow_runs[] | .id')
if [ -z "$queued_run_ids" ]; then
echo "No queued workflow runs found."
exit 0
fi
# Cancel each queued run
for run_id in $queued_run_ids; do
echo "Cancelling run $run_id"
cancel_url="https://api.github.com/repos/$OWNER/$REPO/actions/runs/$run_id/cancel"
curl -s -X POST -H "Authorization: token $TOKEN" -H "Accept: application/vnd.github.v3+json" "$cancel_url"
done
echo "Cancelled all queued workflow runs."
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from verl.utils.model import create_random_mask, compute_position_id_with_mask
from verl.utils.torch_functional import masked_mean, log_probs_from_logits_all_rmpad, logprobs_from_logits
from flash_attn.bert_padding import unpad_input, pad_input, index_first_axis, rearrange
from transformers import LlamaConfig, MistralConfig, GemmaConfig, Qwen2Config
from transformers import AutoModelForCausalLM, AutoModelForTokenClassification, AutoModelForSequenceClassification
# TODO(sgm): add more models for test
# we only need one scale for each model
test_configs = [
LlamaConfig(num_hidden_layers=1),
MistralConfig(num_hidden_layers=1),
GemmaConfig(num_hidden_layers=1),
Qwen2Config(num_hidden_layers=1)
]
def test_hf_casual_models():
batch_size = 4
seqlen = 128
response_length = 127
for config in test_configs:
# config = AutoConfig.from_pretrained(test_case)
with torch.device('cuda'):
model = AutoModelForCausalLM.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
model = model.to(device='cuda')
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.1,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.5)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
logits_rmpad = model(input_ids_rmpad, position_ids=position_ids_rmpad,
use_cache=False).logits # (1, total_nnz, vocab_size)
origin_logits = model(input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=False).logits
origin_logits_rmpad, origin_logits_indices, *_ = unpad_input(origin_logits, attention_mask)
logits_rmpad = logits_rmpad.squeeze(0)
log_probs = log_probs_from_logits_all_rmpad(input_ids_rmpad=input_ids_rmpad,
logits_rmpad=logits_rmpad,
indices=indices,
batch_size=batch_size,
seqlen=seqlen,
response_length=response_length) # (batch, seqlen)
origin_log_probs = log_probs_from_logits_all_rmpad(input_ids_rmpad=input_ids_rmpad,
logits_rmpad=origin_logits_rmpad,
indices=origin_logits_indices,
batch_size=batch_size,
seqlen=seqlen,
response_length=response_length) # (batch, seqlen)
torch.testing.assert_close(masked_mean(log_probs, attention_mask[:, -response_length - 1:-1]),
masked_mean(origin_log_probs, attention_mask[:, -response_length - 1:-1]),
atol=1e-2,
rtol=1e-5)
print('Check pass')
def test_hf_value_models():
batch_size = 4
seqlen = 128
for config in test_configs:
# config = AutoConfig.from_pretrained(test_case)
config.num_labels = 1
setattr(config, 'classifier_dropout', 0)
setattr(config, 'hidden_dropout', 0)
with torch.device('cuda'):
model = AutoModelForTokenClassification.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
model = model.to(device='cuda')
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.1,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.5)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
origin_logits = model(input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=False).logits
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
rmpad_logits = model(input_ids_rmpad, position_ids=position_ids_rmpad,
use_cache=False).logits # (1, total_nnz, 1)
rmpad_logits = rmpad_logits.squeeze(0)
pad_logits = pad_input(rmpad_logits, indices, batch_size, seqlen=seqlen)
torch.testing.assert_close(masked_mean(pad_logits, attention_mask[:, :, None]),
masked_mean(origin_logits, attention_mask[:, :, None]),
atol=1e-2,
rtol=1e-5)
print('Value model check pass')
if __name__ == '__main__':
test_hf_casual_models()
test_hf_value_models()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import contextlib
from dataclasses import dataclass
import pytest
import torch
import copy
import torch.distributed
from torch.distributed import init_device_mesh
from verl.utils.distributed import initialize_global_process_group
from verl.utils.model import create_random_mask, compute_position_id_with_mask
from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad
from verl.utils.ulysses import get_ulysses_sequence_parallel_world_size, set_ulysses_sequence_parallel_group
from verl.workers.sharding_manager import FSDPUlyssesShardingManager
from verl.protocol import DataProto
from flash_attn.bert_padding import unpad_input, index_first_axis, rearrange
from transformers import LlamaConfig, Qwen2Config, PretrainedConfig
from transformers import AutoModelForCausalLM
from verl.models.transformers.monkey_patch import apply_monkey_patch
# TODO(sgm): add more models for test
# we only need one scale for each model
@dataclass
class SequenceParallelConfig:
config: PretrainedConfig
sp_size: int
is_valid: bool
def test_configs():
return [
SequenceParallelConfig(LlamaConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32),
sp_size=8,
is_valid=True),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2,
num_attention_heads=28,
num_key_value_heads=4,
hidden_size=3584),
sp_size=4,
is_valid=True),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2,
num_attention_heads=28,
num_key_value_heads=4,
hidden_size=3584),
sp_size=8,
is_valid=False),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4),
sp_size=4,
is_valid=True),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4),
sp_size=8,
is_valid=True),
]
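# Note (assumption inferred from the cases above): a config appears to be marked
# valid only when num_attention_heads is divisible by sp_size; the 28-head Qwen2
# config with sp_size=8 is the single invalid case.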
def sync_model_parameters_global(layer):
# synchronize weights
for p in layer.parameters():
torch.distributed.broadcast(tensor=p.data, src=0)
@pytest.mark.parametrize("test_config", test_configs())
def test_hf_casual_fwd_bwd(test_config):
if not torch.distributed.is_initialized():
initialize_global_process_group()
context = contextlib.nullcontext() if test_config.is_valid else pytest.raises(AssertionError)
with context:
world_size = torch.distributed.get_world_size()
_hf_casual_fwd_bwd(test_config.config, test_config.sp_size, world_size // test_config.sp_size)
# TODO: this seems not to work; it causes `socketStartConnect: Connect to xxx failed : Software caused connection abort`
# torch.distributed.destroy_process_group()
def _hf_casual_fwd(config, sp_size, dp_size):
assert torch.cuda.device_count() >= 2, "need at least 2 gpus for test"
ulysses_device_mesh = init_device_mesh(device_type='cuda',
mesh_shape=(dp_size, sp_size),
mesh_dim_names=('dp', 'sp'))
sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh)
batch_size = 1
seqlen = 128
response_length = 127
# patch before load
with torch.device('cuda'):
model = AutoModelForCausalLM.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
apply_monkey_patch(model, sp_size)
model = model.to(device='cuda')
sync_model_parameters_global(model)
# different rank will generate different input_ids following fsdp
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0,
max_ratio_of_valid_token=0.9,
min_ratio_of_valid_token=0.8)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
model_inputs = {
'input_ids': input_ids.cuda(),
'attention_mask': attention_mask.cuda(),
'position_ids': position_ids.int().cuda()
}
model_inputs = DataProto.from_dict(model_inputs)
# 1. perform ulysses forward
with sharding_manager:
model_inputs = sharding_manager.preprocess_data(model_inputs)
input_ids = model_inputs.batch['input_ids']
attention_mask = model_inputs.batch['attention_mask']
position_ids = model_inputs.batch['position_ids']
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
# slice input tensor for ulysses
# input_ids are padded and sliced
# position_ids are only padded but not sliced
input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs(
input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size())
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
logits_split_in_seq = model(input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded,
use_cache=False).logits # (1, total_nnz/n, vocab_size)
# all_gather output
logits_full = gather_outpus_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size)
# 2. perform normal forward
set_ulysses_sequence_parallel_group(None)
logits_rmpad_local = model(input_ids_rmpad, position_ids=position_ids_rmpad,
use_cache=False).logits # (1, total_nnz, vocab_size)
mean_local = logits_rmpad_local.mean()
mean_full = logits_full.mean()
torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=1e-5)
def _hf_casual_fwd_bwd(config, sp_size, dp_size):
assert torch.cuda.device_count() >= 2, "need at least 2 gpus for test"
ulysses_device_mesh = init_device_mesh(device_type='cuda',
mesh_shape=(dp_size, sp_size),
mesh_dim_names=('dp', 'sp'))
sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh)
batch_size = 1
seqlen = 128
response_length = 127
# patch before load
with torch.device('cuda'):
model = AutoModelForCausalLM.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
apply_monkey_patch(model, sp_size)
model = model.to(device='cuda')
sync_model_parameters_global(model)
# different rank will generate different input_ids following fsdp
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0,
max_ratio_of_valid_token=0.9,
min_ratio_of_valid_token=0.8)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
model_inputs = {
'input_ids': input_ids.cuda(),
'attention_mask': attention_mask.cuda(),
'position_ids': position_ids.int().cuda()
}
model_inputs = DataProto.from_dict(model_inputs)
# 1. perform ulysses forward
with sharding_manager:
model_inputs = sharding_manager.preprocess_data(model_inputs)
input_ids = model_inputs.batch['input_ids']
attention_mask = model_inputs.batch['attention_mask']
position_ids = model_inputs.batch['position_ids']
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
# slice input tensor for ulysses
# input_ids are padded and sliced
# position_ids are only padded but not sliced
input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs(
input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size())
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
logits_split_in_seq = model(input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded,
use_cache=False).logits # (1, total_nnz/n, vocab_size)
# all_gather output
logits_full = gather_outpus_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size)
# 2. perform normal forward
set_ulysses_sequence_parallel_group(None)
input_ids_full = copy.deepcopy(input_ids_rmpad)
position_ids_full = copy.deepcopy(position_ids_rmpad)
model_no_sp = copy.deepcopy(model)
logits_rmpad_local = model_no_sp(input_ids_full, position_ids=position_ids_full,
use_cache=False).logits # (1, total_nnz, vocab_size)
mean_local = logits_rmpad_local.mean()
mean_full = logits_full.mean()
mean_full.backward()
mean_local.backward()
# 3. check the gradients
grad = model.model.layers[0].self_attn.q_proj.weight.grad
grad_full = model_no_sp.model.layers[0].self_attn.q_proj.weight.grad
torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=1e-5)
torch.testing.assert_close(grad, grad_full, atol=1e-2, rtol=1e-5)
if __name__ == '__main__':
pytest.main([__file__, "-svv"])
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import sys
import os
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
from verl.single_controller.base.decorator import register, Dispatch
@ray.remote
class TestActor(Worker):
def __init__(self) -> None:
super().__init__()
@register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
def foo(self, wait_time):
time.sleep(wait_time)
sys.exit(1)
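# foo sleeps for `wait_time` seconds and then exits the worker process with a
# non-zero code, simulating a crashed worker so that the aliveness check started
# below can detect it.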
if __name__ == "__main__":
wait_time = int(os.getenv("WAIT_TIME", "10"))
ray.init()
# test single-node-no-partition
print(f"test single-node-no-partition")
resource_pool = RayResourcePool([2], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor)
print("create worker group")
wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="test")
wg.start_worker_aliveness_check(1)
time.sleep(1)
print(time.time(), "start foo")
_ = wg.foo(wait_time)
print("foo started")
print(time.time(),
f"wait 6x the wait time ({wait_time * 6}s) so the signal can return to the process without exceeding the process wait time")
time.sleep(wait_time * 6)
ray.shutdown()