Commit b84161d1 authored by jerrrrry

Initial commit
# .github/dependabot.yml
# Enable Dependabot to check the project's dependencies.
# Dependabot will open pull requests to update dependencies automatically.
version: 2
updates:
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: weekly
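
# A possible extension (a sketch only, not part of this commit): Dependabot
# can also keep the pinned GitHub Actions used in the workflows below up to
# date by adding a second entry to the same updates list:
#
#   - package-ecosystem: "github-actions"
#     directory: "/"
#     schedule:
#       interval: weekly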

# .github/workflows/dataset.yml
name: dataset

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
    paths:
      - "verl/utils/**/*.py"
      - .github/workflows/dataset.yml
      - "!verl/workers/fsdp_workers.py"
      - "!verl/workers/megatron_workers.py"
      - "!recipe/**"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
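# Worked example (illustrative PR number): on a pull request, github.ref is
# "refs/pull/<number>/merge", so the group resolves to something like
# "dataset-refs/pull/123/merge" and a newer push to the same PR cancels the
# in-flight run; on main the condition is false, so main runs never cancel.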

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  ray:
    runs-on: [L20x8]
    timeout-minutes: 10 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip install -e .[test]
          pip install --upgrade "ray>=2.40.0"
          pip install cupy-cuda12x
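      # cupy-cuda12x is the CUDA 12.x build of CuPy, matching the cu124
      # container image above; the "ray test using cupy" step below is what
      # exercises it.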
      - name: Running dataset tests
        run: |
          if [ ! -d "$HOME/verl-data" ]; then
            git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
          fi
          python3 examples/data_preprocess/geo3k.py
          pytest -s -x tests/verl/utils/dataset/test_rl_dataset.py
          pytest -s -x tests/verl/utils/dataset/test_sft_dataset.py
          pytest -s -x tests/verl/utils/test_import_utils.py
          # pytest -s -x tests/verl/utils/dataset/test_rm_dataset.py
      - name: Running ray test using cupy (move to L20 when the dockerfile is ready)
        run: |
          cd tests/ray
          pytest -s -x test_rvdz.py

# .github/workflows/e2e_ascend.yml
name: e2e_ascend

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      - .github/workflows/e2e_ascend.yml

permissions:
  contents: read

jobs:
  test:
    name: verl Ascend test (self-hosted)
    runs-on: [self-hosted, npu-0]
    timeout-minutes: 5 # Increase this timeout value as needed
    container:
      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
      volumes:
        - /usr/local/dcmi:/usr/local/dcmi
        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
        - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
        # Use the self-hosted cache to speed up pip and model downloads
        # - /home/action/actions-runner/_work/cache:/github/home/.cache/
      options: >-
        --device /dev/davinci0
        --device /dev/davinci_manager
        --device /dev/devmm_svm
        --device /dev/hisi_hdc
        --privileged
        --network "host"
    steps:
      - name: Check npu and CANN info
        run: |
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
          npu-smi info
      - name: Checkout volcengine/verl repo
        uses: actions/checkout@v4
      - name: Run test
        run: |
          lscpu

# .github/workflows/e2e_dapo.yml
name: e2e_dapo

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      # Home
      - "recipe/dapo/src"
      # Entrypoints
      - ".github/workflows/e2e_dapo.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "tests/e2e/run_dapo.sh"
      - "!examples"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Megatron
      - "!verl/workers/**/megatron_*.py"

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  e2e_dapo:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,gpu]
      - name: Prepare GSM8K dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py
      - name: Running the E2E test with the DAPO algorithm
        run: |
          ray stop --force
          bash tests/e2e/run_dapo.sh

# .github/workflows/e2e_eval_aime24.yml
name: e2e_eval_aime24

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
    paths:
      - "**/*.py"
      # Home
      - "recipe/r1"
      - "!recipe/r1/README.md"
      # Entrypoints
      - ".github/workflows/e2e_eval_aime24.yml"
      - "tests/e2e/run_r1_distill_qwen_aime24_eval.sh"
      - "verl/trainer/main_generation.py"
      - "verl/trainer/config/generation.yaml"
      - "!examples"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Other recipes
      - "!recipe"

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  e2e_eval_aime24:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,gpu,math]
      - name: Prepare AIME 2024 dataset
        run: |
          ray stop --force
          python3 recipe/r1/data_process.py --task aime2024
      - name: Running generation and evaluation on AIME 2024
        run: |
          ray stop --force
          bash tests/e2e/run_r1_distill_qwen_aime24_eval.sh

# .github/workflows/e2e_ppo_trainer.yml
name: e2e_ppo_trainer

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      # Entrypoints
      - ".github/workflows/e2e_ppo_trainer.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "examples/data_preprocess/geo3k.py"
      - "tests/e2e/ppo_trainer"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_trainer.yaml"
      - "!examples"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Recipes
      - "!recipe"
      # Megatron
      - "!verl/workers/**/megatron_*.py"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read
jobs:
  e2e_ppo_trainer_vllm:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,vllm]
      - name: Prepare GSM8K dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py
      # Function RM
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving
        run: |
          ray stop --force
          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 bash tests/e2e/ppo_trainer/run_function_reward.sh
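      # Note: every variation below reuses this same entrypoint and is
      # parameterized purely through environment variables (RESUME_MODE,
      # RM_PAD, ADV_ESTIMATOR, CUSTOM_REWARD_FN, USE_KL, MODEL_ID, ...);
      # unset variables presumably fall back to defaults inside
      # run_function_reward.sh.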
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm after resuming
        run: |
          ray stop --force
          RESUME_MODE=auto bash tests/e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E without rmpad using function rm
        run: |
          ray stop --force
          RM_PAD=False bash tests/e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (GRPO)
        run: |
          ray stop --force
          ADV_ESTIMATOR=grpo USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (ReMax)
        run: |
          ray stop --force
          ADV_ESTIMATOR=remax USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using customized reward function
        run: |
          ray stop --force
          CUSTOM_REWARD_FN=True bash tests/e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with in-reward kl and kl loss
        run: |
          ray stop --force
          USE_KL=True bash tests/e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests with FSDP on 8 L20 GPUs (DeepSeek)
        run: |
          ray stop --force
          MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/ppo_trainer/run_function_reward.sh
      # Model RM
      - name: Running GSM8K E2E with rmpad using model rm
        run: |
          ray stop --force
          bash tests/e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E without rmpad using model rm
        run: |
          ray stop --force
          RM_PAD=False bash tests/e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm and ulysses sp=2
        run: |
          ray stop --force
          SP_SIZE=2 bash tests/e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm and dynamic batch size
        run: |
          ray stop --force
          SEQ_BALANCE=True bash tests/e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm with Liger Kernel enabled
        run: |
          ray stop --force
          LIGER=True bash tests/e2e/ppo_trainer/run_model_reward.sh
  e2e_ppo_trainer_vllm_vlm:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
      options: --gpus all --shm-size=50g # The visual dataloader requires large shared memory
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,geo,vllm]
      # Geo3k
      - name: Prepare Geo3k dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/geo3k.py
      - name: Running Geo3k VLM E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
            MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
            MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
            ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
            bash tests/e2e/ppo_trainer/run_function_reward.sh
  e2e_ppo_trainer_sglang:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: ocss884/verl-sglang:ngc-th2.5.1-cu126-sglang0.4.4.post4
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,gpu,sglang] --no-deps
      - name: Prepare GSM8K dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt
        run: |
          ray stop --force
          ENGINE=sglang bash tests/e2e/ppo_trainer/run_function_reward.sh
  e2e_ppo_trainer_sglang_vlm:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: ocss884/verl-sglang:ngc-th2.5.1-cu126-sglang0.4.4.post4
      options: --gpus all --shm-size=50g # The visual dataloader requires large shared memory
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,geo,gpu,sglang]
      # Geo3k
      - name: Prepare Geo3k dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/geo3k.py
      - name: Running Geo3k VLM E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
            MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
            MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
            ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
            ENGINE=sglang bash tests/e2e/ppo_trainer/run_function_reward.sh

# .github/workflows/e2e_ppo_trainer_megatron.yml
name: e2e_ppo_trainer_megatron
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.3.x
    paths:
      - "**/*.py"
      # Entrypoints
      - ".github/workflows/e2e_ppo_trainer_megatron.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "tests/e2e/run_ppo_trainer_megatron.sh"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_megatron_trainer.yaml"
      - "!examples"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Recipes
      - "!recipe"
      # FSDP
      - "!verl/workers/**/*dp_*.py"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read
jobs:
  e2e_ppo_trainer_megatron:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
      VLLM_ATTENTION_BACKEND: XFORMERS # TODO: remove once vLLM is updated
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test]
      - name: Prepare GSM8K dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with validation and saving
        run: |
          ray stop --force
          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 bash tests/e2e/run_ppo_trainer_megatron.sh
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) after resuming
        run: |
          ray stop --force
          RESUME_MODE=auto bash tests/e2e/run_ppo_trainer_megatron.sh
      - name: Test Megatron checkpoints merging function (Qwen Actor and Critic)
        run: |
          exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal"
          python scripts/model_merger.py --backend megatron --tie-word-embedding --hf_model_path Qwen/Qwen2.5-0.5B --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
          python scripts/model_merger.py --backend megatron --is-value-model --hf_model_path Qwen/Qwen2.5-0.5B --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface
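      # Reading of the flags above (inferred from their names, not verified
      # against scripts/model_merger.py): --tie-word-embedding is presumably
      # needed because Qwen2.5-0.5B ties its input and output embeddings,
      # --is-value-model marks the critic's scalar value head, and
      # --test/--test_hf_dir compare the merged Megatron shards against the
      # saved HuggingFace-format reference.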
      - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
        run: |
          ray stop --force
          ADV_ESTIMATOR=grpo bash tests/e2e/run_ppo_trainer_megatron.sh
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
        run: |
          ray stop --force
          SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek) after resuming
        run: |
          ray stop --force
          RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
      - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
        run: |
          ray stop --force
          ADV_ESTIMATOR=grpo MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/e2e/run_ppo_trainer_megatron.sh
      - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic)
        run: |
          exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal"
          python scripts/model_merger.py --backend megatron --hf_model_path deepseek-ai/deepseek-coder-1.3b-instruct --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
          python scripts/model_merger.py --backend megatron --is-value-model --hf_model_path deepseek-ai/deepseek-coder-1.3b-instruct --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface

# .github/workflows/e2e_prime.yml
name: e2e_prime

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      # Home
      - "recipe/prime"
      # Entrypoints
      - ".github/workflows/e2e_prime.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "tests/e2e/run_prime.sh"
      - "!examples"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Megatron
      - "!verl/workers/**/megatron_*.py"

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  e2e_prime:
    runs-on: [L20x8]
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,gpu]
      - name: Prepare GSM8K dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py
      - name: Running GSM8K E2E with the PRIME algorithm
        run: |
          ray stop --force
          bash tests/e2e/run_prime.sh

# .github/workflows/e2e_sft.yml
name: e2e_sft

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      # Entrypoints
      - ".github/workflows/e2e_sft.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "tests/e2e/sft"
      - "verl/trainer/fsdp_sft_trainer.py"
      - "verl/trainer/config/sft_trainer.yaml"
      - "!examples"
      - "!verl/trainer/main_*.py"
      # Recipes
      - "!recipe"
      # Megatron
      - "!verl/workers/**/megatron_*.py"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  e2e_sft:
    runs-on: [L20x8]
    timeout-minutes: 20 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install peft
          pip3 install -e .[test,gpu]
      - name: Prepare GSM8K dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          bash tests/e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs w/o rmpad using function rm
        run: |
          ray stop --force
          RM_PAD=False bash tests/e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism
        run: |
          ray stop --force
          SP_SIZE=2 bash tests/e2e/sft/run_sft.sh
      - name: Check the loss difference between the sequence-parallel and default implementations
        run: |
          ray stop --force
          ENTRYPOINT="tests/e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallelism and liger
        run: |
          ray stop --force
          SP_SIZE=2 LIGER=True bash tests/e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests with LoRA
        run: |
          ray stop --force
          LORA_RANK=32 bash tests/e2e/sft/run_sft.sh
      # TODO: multiturn

# .github/workflows/model.yml
name: model_rmpad

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.3.x
    paths:
      - "verl/**/*.py"
      - "tests/**/*.sh"
      - "tests/model/*"
      - .github/workflows/model.yml
      - "!recipe/**"

# Declare permissions as read-only.
permissions:
  contents: read
jobs:
  model_rmpad:
    runs-on: [L20x8]
    timeout-minutes: 20 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository and upgrade to the latest transformers/flash_attn
        run: |
          pip3 install -e .[test]
          pip3 install --upgrade transformers
      - name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8
        run: |
          pytest -s tests/model/test_transformer.py
      - name: Running rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          pip3 install --upgrade flash_attn --no-build-isolation
          pytest -s tests/model/test_transformer.py
      - name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          torchrun --nproc_per_node=8 tests/checkpoint/test_fsdp_ckpt.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers
        run: |
          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.49.0
        run: |
          pip3 install transformers==4.49.0
          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.48.0
        run: |
          pip3 install transformers==4.48.0
          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.47.0
        run: |
          pip3 install transformers==4.47.0
          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.46.0
        run: |
          pip3 install transformers==4.46.0
          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.45.0
        run: |
          pip3 install transformers==4.45.0
          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
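      # The repeated pin-and-test steps above could equivalently be expressed
      # as a job-level strategy matrix; a minimal sketch (not part of this
      # commit):
      #
      #   strategy:
      #     matrix:
      #       transformers: ["4.45.0", "4.46.0", "4.47.0", "4.48.0", "4.49.0"]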
      - name: Run distributed test
        run: |
          bash tests/distributed/run_all.sh

# Pylint workflow
name: Pylint Check

on:
  push:
    paths:
      - '**.py'
      - 'requirements.txt'
      - 'pyproject.toml'
  pull_request:
    paths:
      - '**.py'
      - 'requirements.txt'
      - 'pyproject.toml'

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'
      - name: Install pylint (version from requirements.txt)
        run: |
          PYLINT_VERSION=$(grep '^pylint' requirements.txt)
          if [ -z "$PYLINT_VERSION" ]; then
            echo "No pylint version found in requirements.txt"
            exit 1
          fi
          # Install only pylint to avoid dependency problems on the CPU runner
          pip install "$PYLINT_VERSION"
      - name: Run pylint
        run: |
          pylint --recursive=y --rcfile=pyproject.toml ./

# .github/workflows/ray_test.yml
name: ray

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "verl/single_controller/*.py"
      - .github/workflows/ray_test.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "verl/single_controller/*.py"
      - .github/workflows/ray_test.yml
      - "!recipe/**"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  ray:
    runs-on: [L20x8]
    timeout-minutes: 10 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip install -e .[test]
          pip install --upgrade "ray>=2.40.0"
      - name: Running ray tests that need 8 GPUs
        run: |
          cd tests/ray
          pytest -s -x --ignore=test_check_worker_alive.py --ignore=test_rvdz.py .

# .github/workflows/sandbox.yml
name: sandbox

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.3.x
    paths:
      - "**/*.py"
      - .github/workflows/sandbox.yml

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  sandbox:
    runs-on: [L20x8]
    timeout-minutes: 10 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0-te2.0
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,prime]
          pip3 install vllm==0.5.4
      - name: Running sandbox tests on 8 L20 GPUs
        run: |
          cd tests/sandbox
          pytest -s -x .

# .github/workflows/sanity.yml
name: sanity

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/sanity.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/sanity.yml

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  sanity:
    runs-on: ubuntu-latest
    timeout-minutes: 5 # Increase this timeout value as needed
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install the current repository
        run: |
          pip install -e .[test]
      - name: Run sanity test
        run: |
          pytest -s -x tests/sanity
      - name: Run utility test
        run: |
          pytest -s -x tests/utility
      - name: Run license test
        run: |
          python3 tests/sanity/check_license.py --directory .

# OSSF Scorecard workflow
# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.
name: Scorecard supply-chain security

on:
  # For Branch-Protection check. Only the default branch is supported. See
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
  branch_protection_rule:
  # To guarantee Maintained check is occasionally updated. See
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
  schedule:
    - cron: '27 7 * * 1'
  push:
    branches: [ "main" ]

# Declare default permissions as read only.
permissions: read-all

jobs:
  analysis:
    name: Scorecard analysis
    runs-on: ubuntu-latest
    permissions:
      # Needed to upload the results to code-scanning dashboard.
      security-events: write
      # Needed to publish results and get a badge (see publish_results below).
      id-token: write
      # Uncomment the permissions below if installing in a private repository.
      # contents: read
      # actions: read
    steps:
      - name: "Checkout code"
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          persist-credentials: false
      - name: "Run analysis"
        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
        with:
          results_file: results.sarif
          results_format: sarif
          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
          # - you want to enable the Branch-Protection check on a *public* repository, or
          # - you are installing Scorecard on a *private* repository
          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
          # repo_token: ${{ secrets.SCORECARD_TOKEN }}
          # Public repositories:
          # - Publish results to OpenSSF REST API for easy access by consumers
          # - Allows the repository to include the Scorecard badge.
          # - See https://github.com/ossf/scorecard-action#publishing-results.
          # For private repositories:
          # - `publish_results` will always be set to `false`, regardless
          #   of the value entered here.
          publish_results: true
      # Upload the results to GitHub's code scanning dashboard (optional).
      # Commenting out will disable upload of results to your repo's Code Scanning dashboard
      - name: "Upload to code-scanning"
        uses: github/codeql-action/upload-sarif@9e8d0789d4a0fa9ceb6b1738f7e269594bdd67f0 # v3.28.9
        with:
          sarif_file: results.sarif

# Secret scanning workflow (TruffleHog)
on:
  push:
    branches:
      - main
  pull_request:

permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@7dc056a193116ba8d82154bf0549381c8fb8545c # v3.88.14
        with:
          extra_args: --results=verified,unknown
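# Note on extra_args above: --results=verified,unknown limits findings to
# secrets TruffleHog could actively verify plus those whose verification was
# inconclusive, which cuts noise from clearly-unverified matches.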

# .github/workflows/vllm.yml
name: vllm

on:
  # Trigger the workflow on pull request,
  # but only for the main branch
  pull_request:
    branches:
      - main
      - v0.3.x
    paths:
      - "**/*.py"
      # Entrypoints
      - ".github/workflows/vllm.yml"
      - "tests/generation"
      - "verl/trainer/main_generation.py"
      - "verl/trainer/config/generation.yaml"
      - "!examples"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Recipes
      - "!recipe"
      # FSDP
      - "!verl/workers/**/*dp_*.py"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # SGLang
      - "!**/*sglang*"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read
jobs:
  vllm:
    runs-on: [L20x8]
    timeout-minutes: 60 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
      HF_ENDPOINT: "https://hf-mirror.com"
    container:
      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test]
          pip3 install vllm==0.5.4
      - name: Download models to use
        run: |
          huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct
          huggingface-cli download 'Qwen/Qwen2-7B-Instruct'
          huggingface-cli download 'deepseek-ai/deepseek-llm-7b-chat'
          # Disable further Hub requests to avoid network errors. Written to
          # GITHUB_ENV so it persists into later steps; a plain "export" at
          # the end of this step would have no effect on them.
          echo "HF_HUB_OFFLINE=1" >> "$GITHUB_ENV"
      - name: Running vllm tests on 8 L20 GPUs
        run: |
          cd tests/rollout
          torchrun --standalone --nnodes=1 --nproc_per_node=8 $(which pytest) -s test_vllm_hf_loader.py
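      # torchrun expects a script path rather than a console command, so
      # "$(which pytest)" resolves the pytest entrypoint to a file; the 8
      # ranks then run the same test module inside one torch.distributed job.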
      - name: Test a newer vLLM (0.7.3)
        run: |
          pip3 install --upgrade vllm==0.7.3
          cd tests/rollout
          torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py
      - name: Run Qwen 0.5B generation test
        run: |
          cd tests/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=4 GEN_TP=2 bash ./run_gen_qwen05.sh
          rm -rf "${OUTPUT_PATH}"
      - name: Run Qwen 0.5B generation test when world_size == 1
        run: |
          cd tests/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=1 GEN_TP=1 bash ./run_gen_qwen05.sh
          rm -rf "${OUTPUT_PATH}"

# .github/workflows/yapf_format.yml
name: yapf

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/yapf_format.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/yapf_format.yml

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions as read-only.
permissions:
  contents: read

jobs:
  yapf:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      # - name: checkout
      #   run: |
      #     commits=${{ github.event.pull_request.commits }}
      #     if [[ -n "$commits" ]]; then
      #       # Prepare enough depth for diffs with main
      #       git fetch --depth="$(( commits + 1 ))"
      #     fi
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --upgrade yapf
          pip install toml==0.10.2
      - name: Running yapf
        run: |
          yapf -r -vv -d --style=./.style.yapf verl tests examples recipe
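      # To apply the same formatting locally, swap the diff flag (-d) for
      # in-place rewriting (-i):
      #   yapf -r -i --style=./.style.yapf verl tests examples recipe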

# .gitignore
**/*.pt
**/checkpoints
**/wget-log
**/_build/
**/*.ckpt
**/outputs
**/*.tar.gz
**/playground
**/wandb
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
dataset/*
tensorflow/my_graph/*
.idea/
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
tmp/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
.venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# vscode
.vscode
# Mac
.DS_Store
# output logs
tests/e2e/toy_examples/deepspeed/synchronous/output.txt
# vim
*.swp
# ckpt
*.lock
# data
*.parquet
# local logs
logs
log
outputs

# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.11"
    rust: "1.70"

sphinx:
  configuration: docs/conf.py

python:
  install:
    - requirements: docs/requirements-docs.txt
    - method: pip
      path: .
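# Note: the "- method: pip / path: ." entry installs the repository itself
# into the docs build environment, presumably so Sphinx can import the verl
# package (e.g. for autodoc) when building the documentation.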