Commit f87b35b2 authored by jerrrrry

Initial commit
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}
MAX_PROMPT_LEN=${MAX_PROMPT_LEN:-512}
MAX_RESPONSE_LEN=${MAX_RESPONSE_LEN:-512}
ENGINE=${ENGINE:-vllm}
RM_PAD=${RM_PAD:-True}
ADV_ESTIMATOR=${ADV_ESTIMATOR:-gae}
USE_KL=${USE_KL:-False}
CUSTOM_REWARD_FN=${CUSTOM_REWARD_FN:-False}
ENABLE_CHUNKED_PREFILL=${ENABLE_CHUNKED_PREFILL:-True} # For vLLM VLM placeholder issue: https://github.com/vllm-project/vllm/issues/15185
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
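# Worked example with the defaults above (NUM_GPUS=8, b=2, g=4), assuming the
# prompt batch sizes are derived by dividing trajectories by g as in the comments:
#   train_traj_micro_bsz  = 2 * 8  = 16 trajectories per forward pass
#   train_traj_mini_bsz   = 16 * 2 = 32 trajectories per mini-batch
#   train_prompt_mini_bsz = 32 / 4 = 8  prompts per mini-batch
#   train_prompt_bsz      = 8 * 2  = 16 prompts per training batch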
reward_fn_name=null
reward_fn_file_path=null
output_file="$(pwd)/output.txt"
if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
reward_fn_name="my_reward_function"
reward_fn_file_path="$(pwd)/my_reward_function.py"
rm -rf "${reward_fn_file_path}"
cat <<EOF > "$reward_fn_file_path"
def ${reward_fn_name}(data_source, solution_str, ground_truth, extra_info=None):
print(f"Congratulations!!! You have called ${reward_fn_name} successfully!!!")
return 0.1
EOF
rm -rf "${output_file}"
fi
exp_name="$(basename "${MODEL_ID,,}")-function-reward-minimal"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator="${ADV_ESTIMATOR}" \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size="${train_prompt_bsz}" \
data.max_prompt_length="${MAX_PROMPT_LEN}" \
data.max_response_length="${MAX_RESPONSE_LEN}" \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding="${RM_PAD}" \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.use_kl_loss="${USE_KL}" \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name="${ENGINE}" \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.enable_chunked_prefill="${ENABLE_CHUNKED_PREFILL}" \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding="${RM_PAD}" \
critic.model.path="${MODEL_PATH}" \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
custom_reward_function.path="${reward_fn_file_path}" \
custom_reward_function.name="${reward_fn_name}" \
algorithm.use_kl_in_reward="${USE_KL}" \
algorithm.kl_penalty=kl \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.nnodes=1 \
trainer.n_gpus_per_node="${NUM_GPUS}" \
trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
trainer.test_freq="${TEST_FREQ}" \
trainer.save_freq="${SAVE_FREQ}" \
trainer.resume_mode="${RESUME_MODE}" \
trainer.total_epochs=2 \
trainer.total_training_steps="${TOT_TRAIN_STEPS}" "$@" \
| tee "${output_file}"
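# The run output is teed to ${output_file}; when CUSTOM_REWARD_FN=True,
# check_custom_rwd_fn.py below inspects that file (presumably for the success
# message printed by the generated custom reward function).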
if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
python3 tests/e2e/check_custom_rwd_fn.py --output_file="${output_file}"
check_exit_code=$?
rm -rf "${reward_fn_file_path}"
rm -rf "${output_file}"
# Return the exit code of check_custom_rwd_fn.py if it fails
if [ $check_exit_code -ne 0 ]; then
exit $check_exit_code
fi
fi
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}
RM_PAD=${RM_PAD:-True}
SP_SIZE=${SP_SIZE:-1}
SEQ_BALANCE=${SEQ_BALANCE:-False}
LIGER=${LIGER:-False}
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
train_max_token_num_per_gpu=32768
infer_max_token_num_per_gpu=32768
exp_name="$(basename "${MODEL_ID,,}")-model-reward-minimal"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gae \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size=${train_prompt_bsz} \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.return_raw_chat=True \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.model.use_liger="${LIGER}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding="${RM_PAD}" \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.use_dynamic_bsz="${SEQ_BALANCE}" \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.ulysses_sequence_parallel_size="${SP_SIZE}" \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.optim.lr=1e-5 \
critic.ulysses_sequence_parallel_size="${SP_SIZE}" \
critic.model.use_remove_padding="${RM_PAD}" \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path="${MODEL_PATH}" \
critic.model.enable_gradient_checkpointing=False \
critic.use_dynamic_bsz="${SEQ_BALANCE}" \
critic.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \
critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.ulysses_sequence_parallel_size="${SP_SIZE}" \
reward_model.model.path="${MODEL_PATH}" \
reward_model.model.use_remove_padding="${RM_PAD}" \
reward_model.model.fsdp_config.param_offload=True \
reward_model.use_dynamic_bsz="${SEQ_BALANCE}" \
reward_model.forward_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.nnodes=1 \
trainer.n_gpus_per_node="${NUM_GPUS}" \
trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
trainer.test_freq="${TEST_FREQ}" \
trainer.save_freq="${SAVE_FREQ}" \
trainer.resume_mode="${RESUME_MODE}" \
trainer.total_epochs=2 \
trainer.total_training_steps="${TOT_TRAIN_STEPS}" "$@"
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
adv_estimator=grpo
kl_coef=0.0
use_kl_in_reward=False
use_kl_loss=False
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
max_prompt_length=1024
max_response_length=2048
enable_overlong_buffer=True
overlong_buffer_len=128
overlong_penalty_factor=1.0
loss_agg_mode="token-mean"
enable_filter_groups=True
filter_groups_metric=seq_reward
max_num_gen_batches=10
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
gen_prompt_bsz=$((train_prompt_bsz * 4))
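# Worked example with the defaults above (NUM_GPUS=8, b=2, g=4): 16 training
# prompts per step and gen_prompt_bsz = 64, i.e. dynamic sampling may generate
# for up to 4x as many prompts as are finally trained on after group filtering.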
exp_name="$(basename "${MODEL_ID,,}")-dapo-minimal"
python3 -m recipe.dapo.src.main_dapo \
data.train_files="${HOME}/data/gsm8k/train.parquet" \
data.val_files="${HOME}/data/gsm8k/test.parquet" \
reward_model.reward_manager=dapo \
algorithm.adv_estimator=${adv_estimator} \
algorithm.use_kl_in_reward=${use_kl_in_reward} \
algorithm.kl_ctrl.kl_coef=${kl_coef} \
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
data.max_prompt_length=${max_prompt_length} \
data.max_response_length=${max_response_length} \
reward_model.overlong_buffer.enable=${enable_overlong_buffer} \
reward_model.overlong_buffer.len=${overlong_buffer_len} \
reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
data.train_batch_size=${train_prompt_bsz} \
data.gen_batch_size=${gen_prompt_bsz} \
algorithm.filter_groups.enable=${enable_filter_groups} \
algorithm.filter_groups.metric=${filter_groups_metric} \
algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.n_gpus_per_node=${NUM_GPUS} \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_epochs=2 \
trainer.resume_mode=disable \
trainer.val_before_train=False \
trainer.total_training_steps=1 "$@"
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-${HOME}/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-${HOME}/data/gsm8k/test.parquet}
ADV_ESTIMATOR=${ADV_ESTIMATOR:-gae}
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
exp_name="$(basename "${MODEL_ID,,}")-megatron-gsm8k-minimal"
python3 -m verl.trainer.main_ppo --config-path=config \
--config-name='ppo_megatron_trainer.yaml' \
algorithm.adv_estimator="${ADV_ESTIMATOR}" \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size=${train_prompt_bsz} \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.actor.megatron.context_parallel_size=2 \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.checkpoint.contents=['model','hf_model','optimizer','extra'] \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
actor_rollout_ref.ref.megatron.context_parallel_size=2 \
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
critic.optim.lr=2e-5 \
critic.model.path="${MODEL_PATH}" \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.megatron.pipeline_model_parallel_size=2 \
critic.megatron.virtual_pipeline_model_parallel_size=2 \
critic.megatron.context_parallel_size=2 \
critic.megatron.tensor_model_parallel_size=2 \
critic.checkpoint.contents=['model','hf_model','optimizer','extra'] \
algorithm.use_kl_in_reward=True \
algorithm.kl_penalty=kl \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.nnodes=1 \
trainer.n_gpus_per_node=${NUM_GPUS} \
trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
trainer.test_freq="${TEST_FREQ}" \
trainer.save_freq="${SAVE_FREQ}" \
trainer.resume_mode="${RESUME_MODE}" \
trainer.total_epochs=2 \
trainer.total_training_steps="${TOT_TRAIN_STEPS}" "$@"
#!/usr/bin/env bash
set -xeuo pipefail
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-${HOME}/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-${HOME}/data/gsm8k/test.parquet}
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz / n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g
exp_name="$(basename "${MODEL_ID,,}")-prime-minimal"
python3 -m recipe.prime.main_prime \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.train_batch_size=${train_prompt_bsz} \
data.max_prompt_length=512 \
data.max_response_length=512 \
data.filter_accuracy=True \
data.accuracy_lower_bound=0.2 \
data.accuracy_upper_bound=0.8 \
data.oversample_factor=4 \
data.return_raw_chat=True \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=5e-7 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.adv_estimator=rloo \
algorithm.use_kl_in_reward=True \
algorithm.kl_penalty=kl \
algorithm.kl_ctrl.kl_coef=0.001 \
reward_model.model.path="${MODEL_PATH}" \
reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
reward_model.model.update=before \
reward_model.model.beta_train=0.05 \
reward_model.model.optim.lr=1e-6 \
reward_model.model.optim.grad_clip=10.0 \
reward_model.model.input_tokenizer=null \
reward_model.mini_batch_size=${train_prompt_bsz} \
reward_model.reward_manager=prime \
trainer.val_before_train=False \
trainer.logger=['console'] \
trainer.project_name='verl-test' \
trainer.experiment_name="${exp_name}" \
trainer.n_gpus_per_node=${NUM_GPUS} \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.total_training_steps=1 "$@"
#!/usr/bin/env bash
set -xeuo pipefail
huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--local-dir $HOME/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
python3 -m verl.trainer.main_generation \
trainer.nnodes=1 \
trainer.n_gpus_per_node=8 \
data.path=$HOME/data/r1/test.parquet \
data.prompt_key=prompt \
data.batch_size=1024 \
data.n_samples=1 \
data.output_path=$HOME/data/r1/test-output-k1.parquet \
model.path=$HOME/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
rollout.temperature=0.6 \
rollout.top_p=0.95 \
rollout.prompt_length=1024 \
rollout.response_length=32768 \
rollout.tensor_model_parallel_size=1 \
rollout.gpu_memory_utilization=0.95 \
rollout.max_num_batched_tokens=65536 \
rollout.enforce_eager=False \
rollout.free_cache_engine=False
python3 -m recipe.r1.main_eval \
data.path=$HOME/data/r1/test-output-k1.parquet \
data.prompt_key=prompt \
data.response_key=responses \
custom_reward_function.path=recipe/r1/reward_score.py \
custom_reward_function.name=reward_func
#!/usr/bin/env bash
set -e -x
OUTPUT_FILE="/tmp/output_ray_trainer.txt"
export PATH=$PATH:~/.local/bin
rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
data.train_batch_size=800 \
data.max_prompt_length=16 \
data.max_response_length=32 \
data.return_raw_input_ids=True \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=200 \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.optim.lr=1e-4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
actor_rollout_ref.rollout.name=hf \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
critic.ppo_micro_batch_size_per_gpu=200 \
critic.model.path=tests/e2e/arithmetic_sequence/model \
critic.optim.lr=1e-3 \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=200 \
trainer.experiment_name=arithmetic_sequences \
trainer.logger=['console'] \
trainer.n_gpus_per_node=1 \
trainer.test_freq=1 \
trainer.save_freq=110 | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE
rm -rf $OUTPUT_FILE
#!/usr/bin/env bash
set -e -x
OUTPUT_FILE="/tmp/output_ray_trainer.txt"
export PATH=$PATH:~/.local/bin
rm -rf $OUTPUT_FILE
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
data.train_batch_size=800 \
data.val_batch_size=200 \
data.max_prompt_length=16 \
data.max_response_length=32 \
data.return_raw_input_ids=True \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=200 \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.optim.lr=1e-4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
actor_rollout_ref.rollout.name=hf \
actor_rollout_ref.rollout.use_fire_sampling=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
critic.ppo_micro_batch_size_per_gpu=200 \
critic.model.path=tests/e2e/arithmetic_sequence/model \
critic.optim.lr=1e-3 \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=200 \
trainer.experiment_name=arithmetic_sequences \
trainer.logger=['console'] \
trainer.n_gpus_per_node=1 \
trainer.test_freq=1 \
trainer.save_freq=110 | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE --target 0.19
rm -rf $OUTPUT_FILE
#!/usr/bin/env bash
set -e -x
huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir $HOME/models/Qwen/Qwen2.5-0.5B
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
algorithm.adv_estimator=gae \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.model.tokenizer_path=tests/e2e/arithmetic_sequence/model \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.use_remove_padding=True \
algorithm.use_kl_in_reward=False \
trainer.total_epochs=1
#!/bin/bash
set -xeuo pipefail
# Get the configuration name and engine name from arguments
CONFIG_NAME="$1"
ENGINE="${2:-vllm}"
# Download model if needed
huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir "$HOME/models/Qwen/Qwen2.5-0.5B"
# Run the training with the specified configuration
python3 -m verl.trainer.main_ppo \
--config-name "$CONFIG_NAME" "$@"
#!/usr/bin/env bash
set -xeuo pipefail
ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.fsdp_sft_trainer"}
NUM_GPUS=${NUM_GPUS:-8}
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}"
TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}
SP_SIZE=${SP_SIZE:-1}
LIGER=${LIGER:-False}
MULTITURN=${MULTITURN:-False}
LORA_RANK=${LORA_RANK:-0}
RM_PAD=${RM_PAD:-True}
micro_bsz=2
project_name="verl-test"
exp_name="$(basename "${MODEL_ID,,}")-sft-minimal"
ckpts_home=${ckpts_home:-$HOME/${project_name}/${exp_name}}
mkdir -p "${ckpts_home}"
torchrun --standalone --nnodes=1 --nproc_per_node=${NUM_GPUS} ${ENTRYPOINT} \
data.train_files="${TRAIN_FILES}" \
data.val_files="${VAL_FILES}" \
data.prompt_key=extra_info \
data.response_key=extra_info \
data.prompt_dict_keys=['question'] \
data.response_dict_keys=['answer'] \
data.multiturn.enable="${MULTITURN}" \
data.multiturn.messages_key=messages \
optim.lr=1e-4 \
data.micro_batch_size_per_gpu=${micro_bsz} \
model.partial_pretrain="${MODEL_PATH}" \
model.lora_rank="${LORA_RANK}" \
model.lora_alpha=16 \
model.target_modules=all-linear \
model.use_liger="${LIGER}" \
ulysses_sequence_parallel_size="${SP_SIZE}" \
use_remove_padding="${RM_PAD}" \
trainer.default_local_dir="${ckpts_home}" \
trainer.project_name="${project_name}" \
trainer.experiment_name="${exp_name}" \
trainer.total_training_steps=1 \
trainer.logger=['console'] \
trainer.default_hdfs_dir=null "$@"
rm -rf "${ckpts_home:?}/"*
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed
from tensordict import TensorDict
from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer
from torch.distributed.device_mesh import init_device_mesh
from verl.utils.distributed import initialize_global_process_group
def test_trainer_forward_consistency(trainer: FSDPSFTTrainer, total_steps: int = 4):
"""Test consistency between original forward pass and SP+rmpad forward passes.
Args:
trainer: The FSDPSFTTrainer instance to test
total_steps: Number of steps to test (default: 4)
"""
if trainer.device_mesh.get_rank() == 0:
print("\nStarting debug comparison between original and SP+rmpad forward passes...")
print(f"Sequence parallel size: {trainer.config.ulysses_sequence_parallel_size}")
print(f"Remove padding: {trainer.use_remove_padding}\n")
steps_remaining = total_steps
for epoch in range(1): # Just one epoch for testing
trainer.train_sampler.set_epoch(epoch=epoch)
for data in trainer.train_dataloader:
data = TensorDict(data, batch_size=trainer.config.data.train_batch_size).cuda()
trainer.fsdp_model.train()
micro_batches = data.split(trainer.config.data.micro_batch_size_per_gpu)
for idx, micro_batch in enumerate(micro_batches):
if trainer.device_mesh.get_rank() == 0:
print(f"\nProcessing micro batch {idx + 1}/{len(micro_batches)}")
# Compute losses using both methods
# Disable SP and rmpad
trainer.use_remove_padding = False
old_sp = trainer.config.ulysses_sequence_parallel_size
trainer.config.ulysses_sequence_parallel_size = 1
loss_ref = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)
# Do SP and rmpad
trainer.config.ulysses_sequence_parallel_size = old_sp
trainer.use_remove_padding = True
loss_sp = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)
# Collect losses across all ranks
loss_ref_all = loss_ref.clone()
loss_sp_all = loss_sp.clone()
torch.distributed.all_reduce(loss_ref_all, op=torch.distributed.ReduceOp.AVG)
torch.distributed.all_reduce(loss_sp_all, op=torch.distributed.ReduceOp.AVG)
# Calculate relative difference of averaged losses
rel_diff = torch.abs(loss_ref_all - loss_sp_all) / (torch.abs(loss_ref_all) + 1e-8)
if trainer.device_mesh.get_rank() == 0:
print("\nComparison Results (Averaged across ranks):")
print(f"Reference Loss: {loss_ref_all.item():.6f}")
print(f"SP+rmpad Loss: {loss_sp_all.item():.6f}")
print(f"Relative Difference: {rel_diff.item():.6f}")
assert rel_diff.item() < 1e-2, "Significant difference detected between averaged losses!"
print("Loss difference is within the acceptable range.")
steps_remaining -= 1
if steps_remaining == 0:
break
if steps_remaining == 0:
break
break
if trainer.device_mesh.get_rank() == 0:
print("\nDebug comparison completed successfully.")
def create_trainer(config):
"""Create and initialize a trainer instance with the given config.
Args:
config: Configuration object with training parameters
Returns:
FSDPSFTTrainer: Initialized trainer instance
"""
local_rank, rank, world_size = initialize_global_process_group()
device_mesh = init_device_mesh(device_type='cuda', mesh_shape=(world_size,), mesh_dim_names=('fsdp',))
dp_size = world_size // config.ulysses_sequence_parallel_size
ulysses_device_mesh = init_device_mesh(device_type='cuda',
mesh_shape=(dp_size, config.ulysses_sequence_parallel_size),
mesh_dim_names=('dp', 'sp'))
return FSDPSFTTrainer(config=config, device_mesh=device_mesh, ulysses_device_mesh=ulysses_device_mesh)
def main(config):
"""Main function to run trainer tests.
Args:
config: Configuration object with training parameters
"""
trainer = create_trainer(config)
test_trainer_forward_consistency(trainer)
if __name__ == '__main__':
import hydra
from omegaconf import DictConfig
@hydra.main(config_path="../../../verl/trainer/config", config_name="sft_trainer")
def hydra_entry(cfg: DictConfig) -> None:
main(cfg)
hydra_entry()
#!/usr/bin/env bash
# Tested with 1 & 4 GPUs
set -xeuo pipefail
MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
NGPUS_PER_NODE=${NGPUS_PER_NODE:-4}
OUTPUT_PATH=${OUTPUT_PATH:-$HOME/data/gen/qwen_05_gen_test.parquet}
GEN_TP=${GEN_TP:-2} # Default tensor parallel size to 2
python3 -m verl.trainer.main_generation \
trainer.nnodes=1 \
trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
data.path="${HOME}/data/gsm8k/test.parquet" \
data.prompt_key=prompt \
data.n_samples=1 \
data.output_path="${OUTPUT_PATH}" \
model.path="${MODEL_ID}" \
+model.trust_remote_code=True \
rollout.temperature=1.0 \
rollout.top_k=50 \
rollout.top_p=0.7 \
rollout.prompt_length=2048 \
rollout.response_length=1024 \
rollout.tensor_model_parallel_size="${GEN_TP}" \
rollout.gpu_memory_utilization=0.8
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test memory buffers
- We start with two models with the same weights
- We use Memory buffer to make one of the models and then compare the parameters
"""
import torch
import gc
from transformers import LlamaModel, LlamaConfig
from verl.utils.memory_buffer import MemoryBufferModuleWrapper
def test_memory_buffers():
llama_config = LlamaConfig(vocab_size=256,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=2,
num_attention_heads=16,
num_key_value_heads=16)
model = LlamaModel(config=llama_config).cuda()
model_copy = LlamaModel(config=llama_config).cuda()
model_copy.load_state_dict(model.state_dict())
model_named_params = dict(model.named_parameters())
model_copy_named_params = dict(model_copy.named_parameters())
norm_factor = 1024**3
t_before = torch.cuda.get_device_properties(0).total_memory / norm_factor
r_before = torch.cuda.memory_reserved(0) / norm_factor
a_before = torch.cuda.memory_allocated(0) / norm_factor
print(f'Before Total memory: {t_before} GB, reserved: {r_before} GB, allocated: {a_before} GB')
model_wrapper = MemoryBufferModuleWrapper(model)
t = torch.cuda.get_device_properties(0).total_memory / norm_factor
r = torch.cuda.memory_reserved(0) / norm_factor
a = torch.cuda.memory_allocated(0) / norm_factor
gc.collect()
torch.cuda.empty_cache()
print(f'After Total memory: {t} GB, reserved: {r} GB, allocated: {a} GB')
change_ratio = (a - a_before) / a_before
assert change_ratio < 0.01, f'allocated memory change should be less than 1%, got {change_ratio}'
for (name1, param1), (name2, param2) in zip(model.named_parameters(), model_copy.named_parameters()):
assert name1 == name2
assert torch.eq(param1.data, param2.data).all(), f'{param1.data}, {param2.data}, {name1}'
if __name__ == '__main__':
test_memory_buffers()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def test_flash_attn_cross_entropy():
from verl.utils.torch_functional import logprobs_from_logits_naive
from verl.utils.debug import log_gpu_memory_usage
from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
import torch
from torch import nn
log_gpu_memory_usage('At start')
hidden_states = torch.randn(size=(2048, 5120), device='cuda', requires_grad=True, dtype=torch.bfloat16)
linear = nn.Linear(in_features=5120, out_features=155136, bias=False, device='cuda', dtype=torch.bfloat16)
logits = linear(hidden_states)
# logits = logits.float()
labels = torch.randint(low=0, high=155136, size=(2048,), device='cuda')
log_gpu_memory_usage('before computation')
# output = checkpoint.checkpoint(logprobs_from_logits, logits, labels, use_reentrant=True)
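# The per-token log-probability of the label is exactly the negative of the
# per-token cross-entropy loss, so negating the loss recovers log p(label).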
output = -cross_entropy_loss(logits, labels)[0]
# output = logprobs_from_logits(logits, labels)
log_gpu_memory_usage('After forward')
output.sum().backward()
log_gpu_memory_usage('After backward')
groundtruth = logprobs_from_logits_naive(logits.float(), labels)
torch.testing.assert_close(output, groundtruth)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from verl.utils.model import create_random_mask
from flash_attn.bert_padding import unpad_input
import torch
import pytest
def test_log_probs_from_logits_response_rmpad():
from verl.utils.torch_functional import log_probs_from_logits_response, log_probs_from_logits_response_rmpad
vocab_size = 32000
batch_size = 2
prompt_length = 256
response_length = 256
input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, prompt_length + response_length), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.2,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.6)
response_mask = attention_mask[:, -response_length:]
assert torch.all(response_mask[:, 0] == 1)
logits = torch.randn(batch_size, prompt_length + response_length, vocab_size, device='cuda')
logits_rmpad = unpad_input(logits, attention_mask)[0]
expected_output = log_probs_from_logits_response(input_ids=input_ids,
logits=logits,
response_length=response_length)
actual_output = log_probs_from_logits_response_rmpad(input_ids=input_ids,
attention_mask=attention_mask,
logits_rmpad=logits_rmpad,
response_length=response_length)
# This should be bitwise identical, since this operation only contains gather operators
assert torch.all(torch.eq(actual_output * response_mask, expected_output * response_mask))
@pytest.mark.parametrize("dtype", [torch.float64, torch.float32, torch.float16, torch.bfloat16])
def test_logprobs_from_logits_v2(dtype):
from verl.utils.torch_functional import logprobs_from_logits_v2, logprobs_from_logits_naive
vocab_size = 32000
batch_size = 2
seq_len = 512
labels = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len), device='cuda')
logits = torch.randn(batch_size, seq_len, vocab_size, device='cuda', dtype=dtype)
expected_output = logprobs_from_logits_naive(labels=labels, logits=logits)
actual_output = logprobs_from_logits_v2(labels=labels, logits=logits)
if dtype in [torch.float16, torch.bfloat16]: # float16 falls back to an exactly equivalent method
assert torch.equal(actual_output, expected_output)
else: # small numerical difference when using gather / logsumexp approach
torch.testing.assert_close(actual_output, expected_output, rtol=1e-5, atol=1e-5)
def test_lr_scheduler():
from torch import nn
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
from verl.utils.torch_functional import get_constant_schedule_with_warmup
constant_lr = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=2)
lr_lst = []
for _ in range(5):
lr_lst.append(constant_lr.get_last_lr()[0])
constant_lr.step()
torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.001, 0.001])
from verl.utils.torch_functional import get_cosine_schedule_with_warmup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
cosine_lr = get_cosine_schedule_with_warmup(optimizer=optimizer,
num_warmup_steps=2,
num_training_steps=5,
min_lr_ratio=0.1)
lr_lst = []
for _ in range(5):
lr_lst.append(cosine_lr.get_last_lr()[0])
cosine_lr.step()
torch.testing.assert_close(lr_lst, [0.0, 0.0005, 0.001, 0.0007750000000000002, 0.0003250000000000002])
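# Sanity check of the expected values above: with num_warmup_steps=2 the LR ramps
# linearly from 0 to 1e-3 over the warmup steps; a cosine decay of the form
# lr = 1e-3 * (0.1 + 0.9 * 0.5 * (1 + cos(pi * progress))) with progress = (step - 2) / 3
# reproduces 0.000775 and 0.000325 for the last two steps.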
#!/bin/bash
if [ "$#" -ne 1 ]; then
echo "Usage: $0 YOUR_GITHUB_TOKEN"
echo "Please provide exactly one input argument for your github token."
exit 1
fi
# Set your GitHub repository details
OWNER="volcengine"
REPO="verl"
TOKEN=$1
# API URL for workflow runs
API_URL="https://api.github.com/repos/$OWNER/$REPO/actions/runs?status=queued"
# Check required commands
command -v jq >/dev/null 2>&1 || { echo "jq is required but not installed. Aborting."; exit 1; }
# Get queued workflow runs
response=$(curl -s -H "Authorization: token $TOKEN" -H "Accept: application/vnd.github.v3+json" "$API_URL")
# Run this for debugging
# echo $response
# Extract run IDs
queued_run_ids=$(echo "$response" | jq -r '.workflow_runs[] | .id')
if [ -z "$queued_run_ids" ]; then
echo "No queued workflow runs found."
exit 0
fi
# Cancel each queued run
for run_id in $queued_run_ids; do
echo "Cancelling run $run_id"
cancel_url="https://api.github.com/repos/$OWNER/$REPO/actions/runs/$run_id/cancel"
curl -s -X POST -H "Authorization: token $TOKEN" -H "Accept: application/vnd.github.v3+json" "$cancel_url"
done
echo "Cancelled all queued workflow runs."
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from verl.utils.model import create_random_mask, compute_position_id_with_mask
from verl.utils.torch_functional import masked_mean, log_probs_from_logits_all_rmpad, logprobs_from_logits
from flash_attn.bert_padding import unpad_input, pad_input, index_first_axis, rearrange
from transformers import LlamaConfig, MistralConfig, GemmaConfig, Qwen2Config
from transformers import AutoModelForCausalLM, AutoModelForTokenClassification, AutoModelForSequenceClassification
# TODO(sgm): add more models for test
# we only need one scale for each model
test_configs = [
LlamaConfig(num_hidden_layers=1),
MistralConfig(num_hidden_layers=1),
GemmaConfig(num_hidden_layers=1),
Qwen2Config(num_hidden_layers=1)
]
def test_hf_casual_models():
batch_size = 4
seqlen = 128
response_length = 127
for config in test_configs:
# config = AutoConfig.from_pretrained(test_case)
with torch.device('cuda'):
model = AutoModelForCausalLM.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
model = model.to(device='cuda')
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.1,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.5)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
logits_rmpad = model(input_ids_rmpad, position_ids=position_ids_rmpad,
use_cache=False).logits # (1, total_nnz, vocab_size)
origin_logits = model(input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=False).logits
origin_logits_rmpad, origin_logits_indices, *_ = unpad_input(origin_logits, attention_mask)
logits_rmpad = logits_rmpad.squeeze(0)
log_probs = log_probs_from_logits_all_rmpad(input_ids_rmpad=input_ids_rmpad,
logits_rmpad=logits_rmpad,
indices=indices,
batch_size=batch_size,
seqlen=seqlen,
response_length=response_length) # (batch, seqlen)
origin_log_probs = log_probs_from_logits_all_rmpad(input_ids_rmpad=input_ids_rmpad,
logits_rmpad=origin_logits_rmpad,
indices=origin_logits_indices,
batch_size=batch_size,
seqlen=seqlen,
response_length=response_length) # (batch, seqlen)
torch.testing.assert_close(masked_mean(log_probs, attention_mask[:, -response_length - 1:-1]),
masked_mean(origin_log_probs, attention_mask[:, -response_length - 1:-1]),
atol=1e-2,
rtol=1e-5)
print('Check pass')
def test_hf_value_models():
batch_size = 4
seqlen = 128
for config in test_configs:
# config = AutoConfig.from_pretrained(test_case)
config.num_labels = 1
setattr(config, 'classifier_dropout', 0)
setattr(config, 'hidden_dropout', 0)
with torch.device('cuda'):
model = AutoModelForTokenClassification.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
model = model.to(device='cuda')
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0.1,
max_ratio_of_valid_token=0.8,
min_ratio_of_valid_token=0.5)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
origin_logits = model(input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=False).logits
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
rmpad_logits = model(input_ids_rmpad, position_ids=position_ids_rmpad,
use_cache=False).logits # (1, total_nnz, 1)
rmpad_logits = rmpad_logits.squeeze(0)
pad_logits = pad_input(rmpad_logits, indices, batch_size, seqlen=seqlen)
torch.testing.assert_close(masked_mean(pad_logits, attention_mask[:, :, None]),
masked_mean(origin_logits, attention_mask[:, :, None]),
atol=1e-2,
rtol=1e-5)
print('Value model check pass')
if __name__ == '__main__':
test_hf_casual_models()
test_hf_value_models()
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import contextlib
from dataclasses import dataclass
import pytest
import torch
import copy
import torch.distributed
from torch.distributed import init_device_mesh
from verl.utils.distributed import initialize_global_process_group
from verl.utils.model import create_random_mask, compute_position_id_with_mask
from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad
from verl.utils.ulysses import get_ulysses_sequence_parallel_world_size, set_ulysses_sequence_parallel_group
from verl.workers.sharding_manager import FSDPUlyssesShardingManager
from verl.protocol import DataProto
from flash_attn.bert_padding import unpad_input, index_first_axis, rearrange
from transformers import LlamaConfig, Qwen2Config, PretrainedConfig
from transformers import AutoModelForCausalLM
from verl.models.transformers.monkey_patch import apply_monkey_patch
# TODO(sgm): add more models for test
# we only need one scale for each model
@dataclass
class SequenceParallelConfig:
config: PretrainedConfig
sp_size: int
is_valid: bool
def test_configs():
return [
SequenceParallelConfig(LlamaConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32),
sp_size=8,
is_valid=True),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2,
num_attention_heads=28,
num_key_value_heads=4,
hidden_size=3584),
sp_size=4,
is_valid=True),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2,
num_attention_heads=28,
num_key_value_heads=4,
hidden_size=3584),
sp_size=8,
is_valid=False),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4),
sp_size=4,
is_valid=True),
SequenceParallelConfig(Qwen2Config(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=4),
sp_size=8,
is_valid=True),
]
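# Note (assumption inferred from the cases above): a config appears to be marked
# valid only when num_attention_heads is divisible by sp_size; the 28-head Qwen2
# config with sp_size=8 is the single invalid case.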
def sync_model_parameters_global(layer):
# synchronize weights
for p in layer.parameters():
torch.distributed.broadcast(tensor=p.data, src=0)
@pytest.mark.parametrize("test_config", test_configs())
def test_hf_casual_fwd_bwd(test_config):
if not torch.distributed.is_initialized():
initialize_global_process_group()
context = contextlib.nullcontext() if test_config.is_valid else pytest.raises(AssertionError)
with context:
world_size = torch.distributed.get_world_size()
_hf_casual_fwd_bwd(test_config.config, test_config.sp_size, world_size // test_config.sp_size)
# TODO: this seems not to work; it causes `socketStartConnect: Connect to xxx failed : Software caused connection abort`
# torch.distributed.destroy_process_group()
def _hf_casual_fwd(config, sp_size, dp_size):
assert torch.cuda.device_count() >= 2, "need at least 2 gpus for test"
ulysses_device_mesh = init_device_mesh(device_type='cuda',
mesh_shape=(dp_size, sp_size),
mesh_dim_names=('dp', 'sp'))
sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh)
batch_size = 1
seqlen = 128
response_length = 127
# patch before load
with torch.device('cuda'):
model = AutoModelForCausalLM.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
apply_monkey_patch(model, sp_size)
model = model.to(device='cuda')
sync_model_parameters_global(model)
# different rank will generate different input_ids following fsdp
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0,
max_ratio_of_valid_token=0.9,
min_ratio_of_valid_token=0.8)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
model_inputs = {
'input_ids': input_ids.cuda(),
'attention_mask': attention_mask.cuda(),
'position_ids': position_ids.int().cuda()
}
model_inputs = DataProto.from_dict(model_inputs)
# 1. perform ulysses forward
with sharding_manager:
model_inputs = sharding_manager.preprocess_data(model_inputs)
input_ids = model_inputs.batch['input_ids']
attention_mask = model_inputs.batch['attention_mask']
position_ids = model_inputs.batch['position_ids']
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
# slice input tensor for ulysses
# input_ids are padded and sliced
# position_ids are only padded but not sliced
input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs(
input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size())
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
logits_split_in_seq = model(input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded,
use_cache=False).logits # (1, total_nnz/n, vocab_size)
# all_gather output
logits_full = gather_outpus_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size)
# 2. perform normal forward
set_ulysses_sequence_parallel_group(None)
logits_rmpad_local = model(input_ids_rmpad, position_ids=position_ids_rmpad,
use_cache=False).logits # (1, total_nnz, vocab_size)
mean_local = logits_rmpad_local.mean()
mean_full = logits_full.mean()
torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=1e-5)
def _hf_casual_fwd_bwd(config, sp_size, dp_size):
assert torch.cuda.device_count() >= 2, "need at least 2 gpus for test"
ulysses_device_mesh = init_device_mesh(device_type='cuda',
mesh_shape=(dp_size, sp_size),
mesh_dim_names=('dp', 'sp'))
sharding_manager = FSDPUlyssesShardingManager(ulysses_device_mesh)
batch_size = 1
seqlen = 128
response_length = 127
# patch before load
with torch.device('cuda'):
model = AutoModelForCausalLM.from_config(config=config,
torch_dtype=torch.bfloat16,
attn_implementation='flash_attention_2')
apply_monkey_patch(model, sp_size)
model = model.to(device='cuda')
sync_model_parameters_global(model)
# different rank will generate different input_ids following fsdp
input_ids = torch.randint(low=0, high=config.vocab_size, size=(batch_size, seqlen), device='cuda')
attention_mask = create_random_mask(input_ids=input_ids,
max_ratio_of_left_padding=0,
max_ratio_of_valid_token=0.9,
min_ratio_of_valid_token=0.8)
position_ids = compute_position_id_with_mask(
attention_mask) # TODO(sgm): we can construct the position_ids_rmpad here
model_inputs = {
'input_ids': input_ids.cuda(),
'attention_mask': attention_mask.cuda(),
'position_ids': position_ids.int().cuda()
}
model_inputs = DataProto.from_dict(model_inputs)
# 1. perform ulysses forward
with sharding_manager:
model_inputs = sharding_manager.preprocess_data(model_inputs)
input_ids = model_inputs.batch['input_ids']
attention_mask = model_inputs.batch['attention_mask']
position_ids = model_inputs.batch['position_ids']
input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
attention_mask) # input_ids_rmpad (total_nnz, ...)
input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
# unpad the position_ids to align the rotary
position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
indices).transpose(0, 1)
# slice input tensor for ulysses
# input_ids are padded and sliced
# position_ids are only padded but not sliced
input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs(
input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size())
# forward with input_ids_rmpad and position_ids to enable flash attention varlen
logits_split_in_seq = model(input_ids_rmpad_sliced, position_ids=position_ids_rmpad_padded,
use_cache=False).logits # (1, total_nnz/n, vocab_size)
# all_gather output
logits_full = gather_outpus_and_unpad(logits_split_in_seq, gather_dim=1, unpad_dim=1, padding_size=pad_size)
# 2. perform normal forward
set_ulysses_sequence_parallel_group(None)
input_ids_full = copy.deepcopy(input_ids_rmpad)
position_ids_full = copy.deepcopy(position_ids_rmpad)
model_no_sp = copy.deepcopy(model)
logits_rmpad_local = model_no_sp(input_ids_full, position_ids=position_ids_full,
use_cache=False).logits # (1, total_nnz, vocab_size)
mean_local = logits_rmpad_local.mean()
mean_full = logits_full.mean()
mean_full.backward()
mean_local.backward()
# 3. check the gradients
grad = model.model.layers[0].self_attn.q_proj.weight.grad
grad_full = model_no_sp.model.layers[0].self_attn.q_proj.weight.grad
torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=1e-5)
torch.testing.assert_close(grad, grad_full, atol=1e-2, rtol=1e-5)
if __name__ == '__main__':
pytest.main([__file__, "-svv"])
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import sys
import os
import ray
from verl.single_controller.ray.base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup
from verl.single_controller.base.worker import Worker
from verl.single_controller.base.decorator import register, Dispatch
@ray.remote
class TestActor(Worker):
def __init__(self) -> None:
super().__init__()
@register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
def foo(self, wait_time):
time.sleep(wait_time)
sys.exit(1)
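# foo sleeps for `wait_time` seconds and then exits the worker process with a
# non-zero code, simulating a crashed worker so that the aliveness check started
# below can detect it.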
if __name__ == "__main__":
wait_time = int(os.getenv("WAIT_TIME", "10"))
ray.init()
# test single-node-no-partition
print(f"test single-node-no-partition")
resource_pool = RayResourcePool([2], use_gpu=True)
class_with_args = RayClassWithInitArgs(cls=TestActor)
print("create worker group")
wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="test")
wg.start_worker_aliveness_check(1)
time.sleep(1)
print(time.time(), "start foo")
_ = wg.foo(wait_time)
print("foo started")
print(time.time(),
f"wait 6x the wait time ({wait_time * 6}s) so the signal can return to the process without exceeding the process wait time")
time.sleep(wait_time * 6)
ray.shutdown()