Commit 7f6cc211 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2874 failed with stages
in 0 seconds
# End-to-end SPPO recipe test on the self-hosted L20x8 GPU runner.
name: e2e_sppo

on:
  # Run for pushes and pull requests that target main or a v0.* branch.
  # Path patterns are evaluated in order: a matching "!" pattern removes a
  # file that matched earlier, a later positive pattern adds it back.
  push:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Other recipes
      - "!recipe/**"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # Home (a bare directory name matches no file; `/**` covers its contents)
      - "recipe/sppo/**"
      # Entrypoints
      - ".github/workflows/e2e_sppo.yml"
      - "examples/data_preprocess/gsm8k.py"
      # Dataset script actually executed by the job below
      - "examples/data_preprocess/math_dataset.py"
      - "tests/special_e2e/run_sppo.sh"
  pull_request:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Other recipes
      - "!recipe/**"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # Home (a bare directory name matches no file; `/**` covers its contents)
      - "recipe/sppo/**"
      # Entrypoints
      - ".github/workflows/e2e_sppo.yml"
      - "examples/data_preprocess/gsm8k.py"
      # Dataset script actually executed by the job below
      - "examples/data_preprocess/math_dataset.py"
      - "tests/special_e2e/run_sppo.sh"

# Declare permissions just read content.
permissions:
  contents: read

# Cancel jobs on the same ref if a new one is triggered (never on main).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  e2e_sppo:
    runs-on: [L20x8]
    timeout-minutes: 40  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"  # This is more stable
    container:
      image: verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,gpu,sglang] --no-deps
      - name: Prepare MATH dataset
        run: |
          python3 examples/data_preprocess/math_dataset.py --local_dir ./data/math
      - name: Prepare Model checkpoint
        run: |
          huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct --local-dir ./models/Qwen2.5-0.5B-Instruct
      - name: Running the E2E test with the SPPO algorithm
        run: |
          ray stop --force
          bash tests/special_e2e/run_sppo.sh
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# Runs the GPU unit-test suite plus a few torchrun-launched distributed tests
# on the self-hosted L20x8 runner.
name: GPU unit tests

on:
  # Run for pushes and pull requests that target main or the v0.4.x branch.
  push:
    branches:
      - main
      - v0.4.x
    paths:
      - "**/*.py"
      - .github/workflows/gpu_unit_tests.yml
  pull_request:
    branches:
      - main
      - v0.4.x
    paths:
      # The order that you define paths patterns matters:
      # A matching negative pattern (prefixed with !) after a positive match will exclude the path.
      # A matching positive pattern after a negative match will include the path again.
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      - "!recipe/**"
      # Entrypoints
      - .github/workflows/gpu_unit_tests.yml
      - "tests/**test_*.py"
      # Ignore CPU tests at any depth under tests/ (`*` alone does not cross
      # "/", so `tests/*_on_cpu.py` only covered files directly in tests/).
      - "!tests/**/*_on_cpu.py"

# Cancel jobs on the same ref if a new one is triggered (never on main).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  gpu_unit_tests:
    runs-on: [L20x8]
    timeout-minutes: 40  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      # Environment values are strings; quote so the type is explicit.
      HF_HUB_ENABLE_HF_TRANSFER: "1"
    container:
      image: verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer
          pip3 install --no-deps -e .[test]
          pip3 install --upgrade "ray>=2.40.0"
          pip3 install cupy-cuda12x
      # Everything launched separately below (test_special_*), CPU-only
      # scripts, and suites ignored by glob here are skipped in this step.
      - name: Run all GPU unit tests
        run: |
          pytest -s -x --ignore-glob="*test_special_*.py" --ignore-glob='*on_cpu.py' --ignore-glob="*test_vllm*" --ignore-glob="*_sglang*" --ignore-glob="*_hf_rollout*" --ignore-glob="tests/models/" --ignore-glob='tests/special*' --ignore-glob="tests/experimental" tests/
      - name: Testing LinearCrossEntropyTP Correctness, Computation Time and Memory Consumption
        run: |
          LOW_MEMORY=True torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/utils/test_special_linear_cross_entropy_tp.py
      - name: Testing FSDP2 actor functionality
        run: |
          torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/actor/test_special_dp_actor.py
      - name: Testing FSDP2 critic functionality
        run: |
          torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/workers/critic/test_special_dp_critic.py
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# name: Check PR Title
# Multi-GPU model tests (rmpad, FSDP checkpointing, Ulysses) on the L20x8 runner.
name: model_rmpad

on:
  # Every push to main / v0.* runs the workflow; pull requests only when one
  # of the listed paths changes.
  push:
    branches:
      - main
      - "v0.*"
  pull_request:
    branches:
      - main
      - "v0.*"
    paths:
      - "verl/**/*.py"
      # Entrypoints
      - ".github/workflows/model.yml"
      - "tests/special_distributed/test_fsdp_ckpt.py"
      - "tests/models/**"
      - "tests/special_distributed/run_all.sh"

# Declare permissions just read content.
permissions:
  contents: read

# Cancel jobs on the same ref if a new one is triggered (never on main).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  model_rmpad:
    runs-on: [L20x8]
    timeout-minutes: 20  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"  # This is more stable
    container:
      image: verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository and upgrade to latest transformers/flash_attn
        run: |
          pip3 install --no-deps -e .[test]
          pip3 install --upgrade transformers==4.53.3
      # This used to be two back-to-back steps ("flash_attn 2.5.8" and
      # "latest flash_attn") running the identical command with no install
      # in between, so the second run exercised the same environment.
      - name: Running rmpad model tests on 8 L20 GPUs
        run: |
          pytest -s tests/models/test_transformer.py
      - name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          STRATEGY=fsdp torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers
        run: |
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      # The following steps re-run the Ulysses tests against progressively
      # older pinned transformers releases.
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.49.0
        run: |
          pip3 install transformers==4.49.0
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.48.0
        run: |
          pip3 install transformers==4.48.0
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.47.0
        run: |
          pip3 install transformers==4.47.0
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.46.0
        run: |
          pip3 install transformers==4.46.0
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.45.0
        run: |
          pip3 install transformers==4.45.0
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      # NOTE(review): this runs with transformers 4.45.0 still installed from
      # the previous step - confirm that is the intended version here.
      - name: Run distributed test
        run: |
          bash tests/special_distributed/run_all.sh

  # TODO: Move this back to model_rmpad once FSDP2 is stable.
  # NOTE: List as an independent job to make rerun easier.
  model_rmpad_fsdp2_unstable:
    runs-on: [L20x8]
    timeout-minutes: 20  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"  # This is more stable
    container:
      image: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository and upgrade to latest transformers/flash_attn
        run: |
          pip3 install --no-deps -e .[test]
          pip3 install --upgrade transformers==4.53.3
      - name: Running FSDP2 rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
# Scheduled pre-commit run, also startable by hand.
name: pre-commit-full

on:
  # Weekly: Sunday 00:00 UTC.
  schedule:
    - cron: "0 0 * * 0"
  # Manual trigger from the Actions UI.
  workflow_dispatch:

# Read-only access to repository contents is sufficient.
permissions:
  contents: read

jobs:
  pre-commit-full:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      # Rewrite the ruff output format in the hook config to `github` and
      # stage the modified file before the hooks run.
      - name: Set ruff --output-format=github
        run: |
          sed -i 's/--output-format=full/--output-format=github/' .pre-commit-config.yaml
          git add .pre-commit-config.yaml
      - uses: pre-commit/action@v3.0.1
# c.f. https://github.com/pre-commit/action?tab=readme-ov-file#using-this-action
# Runs the pre-commit hooks on every pull request and on pushes to main / v0.*.
name: pre-commit

# These jobs are lightweight, so no concurrency group / cancellation is set up.
on:
  pull_request:
  push:
    branches:
      - main
      - "v0.*"

# Read-only access to repository contents is sufficient.
permissions:
  contents: read

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install the current repository
        run: |
          pip install -e .
      # Rewrite the ruff output format in the hook config to `github` and
      # stage the modified file before the hooks run.
      - name: Set ruff --output-format=github
        run: |
          sed -i 's/--output-format=full/--output-format=github/' .pre-commit-config.yaml
          git add .pre-commit-config.yaml
      # Check "--all-files" by default
      - uses: pre-commit/action@v3.0.1
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# name: Check PR Title
# Quick CPU-only repository sanity checks (tests, license headers, naming,
# folder layout, documentation requirements).
name: sanity

on:
  # Every push to main / v0.* runs the workflow; pull requests only when one
  # of the listed paths changes.
  push:
    branches:
      - main
      - "v0.*"
  pull_request:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      - .github/workflows/sanity.yml
      - "tests/special_sanity/**"

# A newer run on the same ref cancels the running one, except on main.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Read-only access to repository contents is sufficient.
permissions:
  contents: read

jobs:
  sanity:
    runs-on: ubuntu-latest
    timeout-minutes: 5  # Increase this timeout value as needed
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install the current repository
        run: |
          pip install -e .[test]
      - name: Run sanity test
        run: |
          pytest -s -x tests/special_sanity
      - name: Run license test
        run: |
          python3 tests/special_sanity/check_license.py --directory .
      # Fails the job when the spelling `veRL` appears anywhere outside the
      # excluded directories.
      - name: Assert naming convention
        run: |
          if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then
          echo "Please use verl instead of veRL in the codebase"
          exit 1
          fi
      # Fails the job on any of the listed mixed-case spellings of SGLang.
      - name: Assert SGLang naming convention
        run: |
          if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ -E 'Sglang|sgLang|sglAng|sglaNg|sglanG' .; then
          echo "Please use SGLang or sglang as the formal name of SGLang rollout engine"
          exit 1
          fi
      - name: Validate test folder structure
        run: python3 tests/special_sanity/validate_structure.py
      - name: Assert documentation requirement for functions
        run: python3 tests/special_sanity/validate_imported_docs.py
      - name: Assert device api usage in verl/recipe
        run: python3 tests/special_sanity/check_device_api_usage.py --directory ./recipe
      - name: Assert device api usage in verl/verl
        run: python3 tests/special_sanity/check_device_api_usage.py --directory ./verl
      - name: Assert documentation time info
        run: python3 tests/special_sanity/check_docs_time_info.py
      - name: Check docstrings for specified files
        run: python3 tests/special_sanity/check_docstrings.py
# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.
name: Scorecard supply-chain security

on:
  # Needed by the Branch-Protection check (default branch only), see
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
  branch_protection_rule:
  # Periodic run so the Maintained check gets refreshed, see
  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
  schedule:
    - cron: "27 7 * * 1"
  push:
    branches:
      - main
      - "v0.*"

# Workflow-wide default: read-only for every scope.
permissions: read-all

jobs:
  analysis:
    name: Scorecard analysis
    runs-on: ubuntu-latest
    permissions:
      # Required to upload results to the code-scanning dashboard.
      security-events: write
      # Required to publish results and obtain a badge (see publish_results below).
      id-token: write
      # Uncomment the permissions below if installing in a private repository.
      # contents: read
      # actions: read
    steps:
      - name: "Checkout code"
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1
        with:
          persist-credentials: false
      - name: "Run analysis"
        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736  # v2.3.1
        with:
          results_file: results.sarif
          results_format: sarif
          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
          # - you want to enable the Branch-Protection check on a *public* repository, or
          # - you are installing Scorecard on a *private* repository
          # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
          # repo_token: ${{ secrets.SCORECARD_TOKEN }}
          # Public repositories:
          # - Publish results to OpenSSF REST API for easy access by consumers
          # - Allows the repository to include the Scorecard badge.
          # - See https://github.com/ossf/scorecard-action#publishing-results.
          # For private repositories:
          # - `publish_results` will always be set to `false`, regardless
          # of the value entered here.
          publish_results: true
      # Optional: send the SARIF file to GitHub's code-scanning dashboard.
      # Commenting this step out disables the upload to the repo's Code Scanning dashboard.
      - name: "Upload to code-scanning"
        uses: github/codeql-action/upload-sarif@9e8d0789d4a0fa9ceb6b1738f7e269594bdd67f0  # v3.28.9
        with:
          sarif_file: results.sarif
# Secret scanning with TruffleHog on every pull request and on pushes to
# main / v0.*.
on:
  push:
    branches:
      - main
      - "v0.*"
  pull_request:

# Read-only access to repository contents is sufficient.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1
        with:
          # Full history is checked out for the scan.
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@7dc056a193116ba8d82154bf0549381c8fb8545c  # v3.88.14
        with:
          extra_args: --results=verified,unknown
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# SGLang rollout tests on the self-hosted L20x8 GPU runner.
name: sgl

on:
  workflow_dispatch:  # Manual
  # Run for pushes and pull requests that target main or the v0.2.x branch.
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      # This workflow's own file (previously pointed at vllm.yml).
      - .github/workflows/sgl.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # FSDP
      - "!verl/workers/**/*dp_*.py"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # vLLM
      - "!**/*vllm*"
      # Recipes
      - "!recipe/**"
      # Entrypoints - the steps below run from tests/workers/rollout, so the
      # filters must point there rather than at tests/rollout.
      - ".github/workflows/sgl.yml"
      - "tests/workers/rollout/*sglang*"
      # NOTE(review): assumed to live next to the SGLang tests - confirm.
      - "tests/workers/rollout/async_rollout_utils.py"
      - "tests/workers/rollout/*interaction*"

# Cancel jobs on the same ref if a new one is triggered (never on main).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  sgl:
    runs-on: [L20x8]
    timeout-minutes: 20  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      # Environment values are strings; quote so the type is explicit.
      HF_HUB_ENABLE_HF_TRANSFER: "1"
      SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
      NCCL_SHM_DISABLE: "1"
      NCCL_P2P_DISABLE: "1"
    container:
      image: verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install hf_transfer fastmcp
          pip3 install -e .[test,gpu,sglang] --no-deps
      # NOTE(review): `export` only affects this step's shell, so
      # HF_HUB_OFFLINE is NOT set for the test steps below. Writing it to
      # "$GITHUB_ENV" would persist it; left unchanged because later steps may
      # rely on fetching assets that are not downloaded here - confirm first.
      - name: Download Model to Use
        run: |
          huggingface-cli download 'Qwen/Qwen2-7B-Instruct'
          huggingface-cli download 'Qwen/Qwen2.5-0.5B'
          huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct
          export HF_HUB_OFFLINE=1
      - name: Test the latest SGLang
        run: |
          cd tests/workers/rollout
          torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_spmd.py
      - name: Test the latest SGLang Rollout async with interaction
        run: |
          cd tests/workers/rollout
          torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_async_rollout_w_interaction.py
      - name: Test the latest SGLang Multi Interaction
        run: |
          cd tests/workers/rollout
          torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_multi_interaction.py
      - name: Test the latest SGLang Rollout async with tool
        run: |
          cd tests/workers/rollout
          torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_async_rollout_w_tools.py
      - name: Test the latest SGLang Rollout async with sandbox fusion tool
        run: |
          cd tests/workers/rollout
          pytest -s test_sglang_async_rollout_sf_tools.py
      - name: Test the latest SGLang Rollout async with search tool
        run: |
          cd tests/workers/rollout
          pytest -s test_sglang_async_rollout_search_tools.py
      - name: Test the latest SGLang Rollout async with mcp search tool
        run: |
          cd tests/workers/rollout
          pytest -s test_sglang_async_rollout_mcp_tools.py
      - name: Test the latest SGLang Rollout async with agent loop
        run: |
          ROLLOUT_NAME=sglang pytest -svvv tests/experimental/agent_loop/test_basic_agent_loop.py
      # Note(haibin.lin): for any new test, please update gpu_unit_tests.yaml to avoid repeated tests
      - name: Test the latest SGLang Rollout async with multimodal delta
        run: |
          cd tests/workers/rollout
          pytest -s test_sglang_async_rollout_multimodal_delta.py
# Checks type-annotation and docstring coverage on pull requests that touch
# Python files.
name: Type Annotation and Docstring Coverage

on:
  pull_request:
    paths:
      - "**/*.py"

# Declare permissions just read content (matches the other workflows; this
# file previously relied on the repository default token permissions).
permissions:
  contents: read

jobs:
  type-coverage-check:
    runs-on: [L20x8]
    timeout-minutes: 20  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      # Environment values are strings; quote so the type is explicit.
      HF_HUB_ENABLE_HF_TRANSFER: "1"
      SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
    container:
      image: verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0  # 🚨 Important: fetch full history so `origin/main` is available
      - name: Install dependencies
        run: |
          pip install gitpython
          pip install -e .
      - name: Run type annotation coverage check
        run: |
          python3 tests/special_sanity/type_coverage_check.py
      - name: Run docstring coverage check
        run: |
          python3 tests/special_sanity/check_api_docs.py verl
# # Tests layout
# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...
# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments
# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - Test scripts whose names end with `on_cpu.py` are run on CPU resources in a Linux environment.
# # Workflow layout
# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix.
# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
# - new workflow yaml is added to `.github/workflows`
# - new tests are added to workflow mentioned in 2.
# vLLM rollout and generation tests on the self-hosted L20x8 GPU runner.
name: vllm

on:
  # Every push to main / v0.* runs the workflow; pull requests only when the
  # path filter below matches. Patterns are evaluated in order: a matching
  # "!" pattern removes a file that matched earlier, a later positive pattern
  # adds it back.
  push:
    branches:
      - main
      - "v0.*"
  pull_request:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Recipes
      - "!recipe/**"
      # FSDP
      - "!verl/workers/**/*dp_*.py"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # SGLang
      - "!**/*sglang*"
      # Entrypoints (a bare directory name matches no file; `/**` covers the
      # directory's contents)
      - ".github/workflows/vllm.yml"
      - "tests/special_e2e/generation/**"
      - "tests/workers/rollout/**"
      # Re-apply the SGLang exclusion that the line above would otherwise undo.
      - "!tests/workers/rollout/**/*sglang*"
      - "verl/trainer/main_generation.py"
      - "verl/trainer/config/generation.yaml"

# Cancel jobs on the same ref if a new one is triggered (never on main).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

jobs:
  vllm:
    runs-on: [L20x8]
    timeout-minutes: 60  # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"  # This is more stable
    container:
      image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test]
          pip install tensordict==0.6.2
      # NOTE(review): `export` only affects this step's shell, so
      # HF_HUB_OFFLINE is NOT set for the test steps below. Writing it to
      # "$GITHUB_ENV" would persist it; left unchanged because later steps may
      # rely on fetching assets that are not downloaded here - confirm first.
      - name: Download Model to Use
        run: |
          huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct
          huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct
          huggingface-cli download 'Qwen/Qwen2-7B-Instruct'
          huggingface-cli download 'deepseek-ai/deepseek-llm-7b-chat'
          export HF_HUB_OFFLINE=1
          # Disable requests to avoid network errors
      - name: Test the latest vLLM
        run: |
          torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s tests/workers/rollout/rollout_vllm/test_vllm_spmd.py
      - name: Test the latest vLLM on model with rope scaling
        run: |
          torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s tests/workers/rollout/rollout_vllm/test_vllm_model_rope_scaling.py
      - name: Run Qwen 0.5B generation test
        run: |
          cd tests/special_e2e/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=4 GEN_TP=2 bash ./run_gen_qwen05.sh
          rm -rf "${OUTPUT_PATH}"
      - name: Run Qwen 0.5B generation test when world_size == 1
        run: |
          cd tests/special_e2e/generation
          export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
          MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=1 GEN_TP=1 bash ./run_gen_qwen05.sh
          rm -rf "${OUTPUT_PATH}"
      - name: Test the latest vLLM Rollout async with agent loop
        run: |
          ROLLOUT_NAME=vllm pytest -svvv tests/experimental/agent_loop/test_basic_agent_loop.py
      # Note(haibin.lin): for any new test, please update gpu_unit_tests.yaml to avoid repeated tests
**/*.pt
**/checkpoints
**/wget-log
**/_build/
**/*.ckpt
**/outputs
**/*.tar.gz
**/playground
**/wandb
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
dataset/*
tensorflow/my_graph/*
.idea/
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
tmp/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
pytest.ini
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
.venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# vscode
.vscode
# Mac
.DS_Store
# vim
*.swp
# ckpt
*.lock
# data
*.parquet
# local logs
logs
log
outputs
.history
# pre-commit hook configuration.
repos:
  # ruff: lint with automatic fixes, then format.
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: 'v0.12.2'
    hooks:
      - id: ruff
        args:
          - '--fix'
          - '--show-fixes'
          - '--output-format=full'
        # Jupyter notebooks are not linted.
        exclude: '^.*\.(ipynb)$'
      - id: ruff-format

  # mypy static type checking.
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: 'v1.17.0'
    hooks:
      - id: mypy

  # Repository-local hooks. Each runs once per invocation and receives no
  # file names (pass_filenames: false).
  - repo: local
    hooks:
      - id: autogen-trainer-cfg
        name: Generate and verify verl/trainer/config/_generated_*.yaml
        entry: scripts/generate_trainer_config.sh
        language: script
        pass_filenames: false
      - id: check-docstrings
        name: Check doc string coverage
        entry: python3 tests/special_sanity/check_docstrings.py
        language: python
        pass_filenames: false
      - id: check-license
        name: Check license
        entry: python3 tests/special_sanity/check_license.py --directory .
        language: python
        pass_filenames: false
# Read the Docs build configuration (config file format v2).
# Reference: https://docs.readthedocs.io/en/stable/config-file/v2.html
version: 2

# Build image and toolchain versions.
build:
  os: ubuntu-22.04
  tools:
    python: '3.11'
    rust: '1.70'

# Sphinx project entry point.
sphinx:
  configuration: docs/conf.py

# Python packages installed before the build: the documentation
# requirements first, then this repository itself via pip.
python:
  install:
    - requirements: docs/requirements-docs.txt
    - method: pip
      path: .
# Contributing to verl
Thank you for considering a contribution to verl! We welcome contributions of any kind - bug fixes, enhancements, documentation improvements, or even just feedback. Whether you're an experienced developer or this is your first open-source project, your help is invaluable.
Your support can take many forms:
- Report issues or unexpected behaviors.
- Suggest or implement new features.
- Improve or expand documentation.
- Review pull requests and assist other contributors.
- Spread the word: share verl in blog posts, social media, or give the repo a ⭐.
## Finding Issues to Contribute
Looking for ways to dive in? Check out these issues:
- [Good first issues](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
- [Call for contribution](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22call%20for%20contribution%22)
Furthermore, you can learn the development plan and roadmap via [RFC](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3ARFC) and [Roadmap](https://github.com/volcengine/verl/issues?q=state%3Aopen%20label%3A%22roadmap%22).
## Developing
- **Python-only**: install verl via `pip install -e .[test,vllm]` or `pip install -e .[test,sglang]` and iterate quickly. For full dependency setup, check out the verl [installation doc](https://verl.readthedocs.io/en/latest/start/install.html).
## Code Linting and Formatting
We rely on pre-commit to keep our code consistent. To set it up:
```bash
pip install pre-commit
pre-commit install
# for staged changes
pre-commit run
# for all files in the repo
pre-commit run --all-files
# run a specific hook with pre-commit
# pre-commit run --all-files --show-diff-on-failure --color=always <hook-id>
pre-commit run --all-files --show-diff-on-failure --color=always ruff
pre-commit run --all-files --show-diff-on-failure --color=always autogen-trainer-cfg
```
## Testing
Our test suites run on GitHub Actions. Check these workflows for details:
- [GPU unit tests](https://github.com/volcengine/verl/blob/main/.github/workflows/gpu_unit_tests.yml)
- [CPU unit tests](https://github.com/volcengine/verl/blob/main/.github/workflows/cpu_unit_tests.yml)
- [vLLM tests](https://github.com/volcengine/verl/blob/main/.github/workflows/vllm.yml)
- [SGLang tests](https://github.com/volcengine/verl/blob/main/.github/workflows/sgl.yml)
### Adding CI tests
If possible, please add CI test(s) for your new feature:
1. Find the most relevant workflow yml file, which usually corresponds to a `hydra` default config (e.g. `ppo_trainer`, `ppo_megatron_trainer`, `sft_trainer`, etc).
2. Add related path patterns to the `paths` section if not already included.
3. Minimize the workload of the test script(s) (see existing scripts for examples).
## Building the Docs
```
# Ensure verl is on your PYTHONPATH, e.g.:
pip install -e .[test]
# Install documentation dependencies (the remaining steps run from the docs/ directory)
cd docs
pip install -r requirements-docs.txt
# Generate HTML docs
make clean
make html
# Preview locally
python -m http.server -d _build/html/
```
Open your browser at http://localhost:8000 to explore the docs.
## Pull Requests & Code Reviews
Thanks for submitting a PR! To streamline reviews:
- Follow our Pull Request Template for title format and checklist.
- Adhere to our pre-commit lint rules and ensure all checks pass.
- Update docs for any user-facing changes.
- Add or update tests in the CI workflows, or explain why tests aren't applicable.
## License
See the [LICENSE](https://github.com/volcengine/verl/blob/main/LICENSE) file for full details.
## Thank You
We appreciate your contributions to verl. Your efforts help make the project stronger and more user-friendly. Happy coding!
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Copyright 2023-2024 Bytedance Ltd. and/or its affiliates
\ No newline at end of file
# Verl
verl 版本0.5.0post
主要修改内容:
1. Main_ppo.py中ray.init需指定num_gpus。
2. Vllm的mode模块无法使用需全部屏蔽,所有跟sleep(level=1)和.wake_up(tags=["weights"])的都不适配。
3. Fsdp_workers.py中distributed后端需改为nccl。
4. Worker.py中修改了_setup_env_cuda_visible_devices。
5. Vllm_rollout_spmd中load_format参数传不进来。
6. 激励函数reward_score的init中数据路径需修改。
7. 第5点修改后还需屏蔽在fsdp_vllm.py中关于model.load_weights部分。
运行命令请参考runverl.sh
需要运行命令之前添加export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7;export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
目前只尝试了单节点八卡,后续尝试多节点
# Apptainer/Singularity definition: builds a ROCm image with vLLM rebuilt
# from source, the Python dependencies listed below, and verl installed in
# editable mode.
Bootstrap: docker

# Support - Training: fsdp; Inference: vllm
# From: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
# Header entries use the "Key: value" form, so the base image is given with
# "From:" (a bare Dockerfile-style "FROM <image>" is not a valid header line).
From: lmsysorg/sglang:v0.4.5-rocm630

%environment
    export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
    export HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
    export CFLAGS="-D__HIP_PLATFORM_AMD__"
    export CXXFLAGS="-D__HIP_PLATFORM_AMD__"

%post
    # Create source directory
    mkdir -p /opt/src

    # Uninstall the vllm shipped in the base image and rebuild v0.6.3 from
    # source; the checkout is deleted after installation.
    pip uninstall -y vllm
    cd /opt/src
    git clone -b v0.6.3 https://github.com/vllm-project/vllm.git
    cd vllm
    MAX_JOBS=$(nproc) python3 setup.py install
    cd /opt
    rm -rf /opt/src/vllm

    # Install dependencies
    pip install "tensordict<0.6" --no-deps

    pip install accelerate \
        codetiming \
        datasets \
        dill \
        hydra-core \
        liger-kernel \
        numpy \
        pandas \
        peft \
        "pyarrow>=15.0.0" \
        pylatexenc \
        "ray[data,train,tune,serve]" \
        torchdata \
        transformers \
        wandb \
        orjson \
        pybind11

    # Clone and install verl from GitHub
    cd /opt
    git clone https://github.com/volcengine/verl.git
    cd verl
    # Uncomment to use a specific version
    # git checkout v0.3.0.post0
    pip install -e . --no-deps

    # Install torch_memory_saver
    pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
\ No newline at end of file
# Base image with AWS EFA support.
# Build images with frameworks on top of this one.
FROM verlai/verl:app-verl0.5-sglang0.4.6.post5-mcore0.12.2

# For AWS instances with an EFA network interface (SageMaker AI Pod),
# install the EFA driver:
######## AWS EFA ############
ENV NCCL_VERSION=2.25.1-1
ENV DEBIAN_FRONTEND=noninteractive
ENV EFA_INSTALLER_VERSION=1.40.0
ENV AWS_OFI_NCCL_VERSION=1.14.2
ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0
ENV FI_PROVIDER=efa

RUN apt update && apt install -y linux-image-generic libhwloc-dev

# Download and run the EFA installer, then delete the extracted installer,
# its tarball and the apt package lists.
RUN cd /tmp && \
    curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
    ldconfig && \
    rm -rf /tmp/aws-efa-installer \
           /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
           /var/lib/apt/lists/*

# NCCL EFA Plugin: build aws-ofi-nccl from the release tarball and install
# it under /opt/amazon/efa.
RUN cd /tmp && \
    curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
    mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
    cd /tmp/aws-ofi-nccl && \
    ./autogen.sh && \
    ./configure --prefix=/opt/amazon/efa \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --with-mpi=/opt/amazon/openmpi && \
    make -j$(nproc) install && \
    # Leave the build tree before deleting it. The directory is
    # /tmp/aws-ofi-nccl (see the mv above), not /tmp/aws-ofi/nccl.
    cd /tmp && \
    rm -rf /tmp/aws-ofi-nccl

# NCCL: register the extra library directories with the dynamic linker.
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
    echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
    ldconfig

ENV OMPI_MCA_pml=^cm,ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
    OPAL_PREFIX=/opt/amazon/openmpi \
    NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent \
    FI_EFA_USE_HUGE_PAGE=0

# docker build -t verl:awsefa --label "commit=$(git rev-parse --short HEAD)" .
# on aws:
# docker run --ipc=host --privileged --name verldev --gpus all --network=host --shm-size=1800gb -itd verl:awsefa
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment