Commit 7f6cc211 authored by jerrrrry

Initial commit
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.7.0+cu126 + vllm-0.9.1
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.9.1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Fix qwen vl
RUN pip3 install --no-cache-dir --no-deps trl
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0
# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7
# https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; errors occur if these are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1
# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7
# https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.52.3" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; errors occur if these are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.6
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0
sglang==0.4.8
vllm==0.8.5.post1
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.3
megatron.core==core_v0.12.2
# Preview
transformer_engine==2.5
megatron.core==core_r0.13.0
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4`: We offer a base image with DeepEP built in, for vLLM
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with DeepEP built in, for SGLang
- App image:
- `verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2`
- `verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2`
- `verlai/verl:app-verl0.5-sglang0.4.9.post2-mcore0.12.2-te2.2`
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --upgrade pip setuptools packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --upgrade pip setuptools packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1
# Install flash-attn-2.8.0.post2 (cxx11abi=True)
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.53" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; errors occur if these are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.6
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0
sglang==0.4.8
vllm==0.8.5.post1
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.3
megatron.core==core_v0.12.2
# Preview
transformer_engine==2.5
megatron.core==core_r0.13.0
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0`: We offer a base image with DeepEP built in
- App image:
- `verlai/verl:app-verl0.5-sglang0.4.9-mcore0.12.2`
- `verlai/verl:app-verl0.5-sglang0.4.9-mcore0.13.0-preview`
- vLLM does not yet support the latest version
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (cuda-12.8 + python-3.12)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:25.02-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
# Install flash-attn-2.8.0.post2 (cxx11abi=True)
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp312-cp312-linux_x86_64.whl" && \
FILE="flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp312-cp312-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pre-commit ruff
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.8
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0
sglang==0.4.8
transformer_engine==2.5
megatron.core==core_r0.13.0
nvidia-cudnn-cu12==9.8.0.87
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0`: We offer a base image with FlashInfer 0.2.6.post1 built in
- App image:
- `verlai/verl:app-verl0.5-preview-sglang0.4.8-mcore0.13.0-preview`
- vLLM does not yet support the latest version
## !!!Notice!!!
- pyext lacks maintenance and does not work with Python 3.12; consider using a replacement and deprecating this package.
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = verl
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# verl documentations
## Build the docs
```bash
# If you want to view the auto-generated API docstrings, please make sure verl is available on the python path. For instance, install verl via:
# pip install -e .[test]   (run from the verl repo root)
# Install dependencies needed for building docs.
pip install -r requirements-docs.txt
# Build the docs.
make clean
make html
```
## Open the docs with your browser
```bash
python -m http.server -d _build/html/
```
Launch your browser and navigate to http://localhost:8000 to view the documentation. Alternatively, you can open the file `_build/html/index.html` directly in your browser.
# Upgrading to vllm >= 0.7
Note: verl+vllm 0.8.3 is now stable. Please see ``docs/README_vllm0.8.md`` for the upgrade guide.
## Installation
Note: At the time of writing, verl+vllm 0.7.x supports **FSDP** for training and **vLLM** for rollout.
```
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.7.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
Note that if you are installing a lower version of vLLM (0.7.0, 0.7.1, or 0.7.2), you need to manually apply some tiny patches to vllm (under /path/to/site-packages/vllm after installation) after the above steps:
- vllm/distributed/parallel_state.py: Remove the assertion below:
```
if (world_size
!= tensor_model_parallel_size * pipeline_model_parallel_size):
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
```
- vllm/executor/uniproc_executor.py: change `local_rank = rank` to `local_rank = int(os.environ["LOCAL_RANK"])`
- vllm/model_executor/model_loader/weight_utils.py: remove the `torch.cuda.empty_cache()` in `pt_weights_iterator`
## Features
### Use cuda graph
After installation, examples using FSDP as the training backend can be run. By default, `enforce_eager` is set to True, which disables the cuda graph. To enable cuda graphs and the sleep mode of vLLM>=0.7, add the following lines to the bash script:
```
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
```
For a typical job like examples/ppo_trainer/run_qwen2-7b_seq_balance.sh, the rollout generation time is 85 seconds with vLLM 0.7.0. Enabling the cuda graph reduces the generation time further, to 62 seconds.
**Note:** Currently, if `n` is greater than 1 in `SamplingParams` with vLLM>=0.7, there is a potential performance issue affecting the stability of rollout generation time (some iterations see generation-time bursts) when using vLLM's V0 engine.
### Use vLLM V1 Engine
Using the vLLM V1 engine can avoid instability issues and achieve additional performance improvements. To use the V1 engine, you can first uninstall the previously installed vLLM and then follow the steps below to install the newer version.
```
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout 2275784
sed -i "903a\ data_parallel_size = world_size // pipeline_model_parallel_size // tensor_model_parallel_size" ./vllm/distributed/parallel_state.py
VLLM_USE_PRECOMPILED=1 pip install --editable .
```
Then you can enable the V1 engine by setting `export VLLM_USE_V1=1`. In some benchmark tests, the V1 engine demonstrates a 1.5x speed improvement over the vLLM V0 engine.
Stable support for the vLLM V1 engine is available on verl main.
# Upgrading to vLLM >= 0.8
Last updated: 05/04/2025.
## Installation
Note: This version of verl+vLLM 0.8+ supports **FSDP** for training and **vLLM** for rollout.
```bash
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.8.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
We have a pre-built docker image for verl+vLLM 0.8.3. You can pull it directly with the following command:
```bash
docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
```
## Features
vLLM 0.8+ supports cuda graph and V1 engine by default in verl. To enable these features, remember to add the following lines to the bash script:
```bash
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
```
and also **remove** the environment variable if it was previously set.
## Notes
When you directly upgrade to vllm>=0.8, some dependency packages may undergo version changes. If you encounter the following problem:
```bash
in <module> from torch.multiprocessing.reductions import ForkingPickler
ImportError: cannot import name 'ForkingPickler' from 'torch.multiprocessing.reductions' (/opt/conda/lib/python3.11/site-packages/torch/multiprocessing/reductions.py)
```
You need to upgrade `tensordict` to version 0.6.2 using the command `pip install tensordict==0.6.2`.
/* Make the documentation use full screen width */
.wy-nav-content {
max-width: none !important;
width: 100% !important;
padding: 1.618em 3.236em !important;
}
/* Adjust the content wrapper - will be set by JavaScript */
.wy-nav-content-wrap {
margin-left: 300px;
transition: margin-left 0.2s ease;
width: auto !important;
position: relative !important;
background: white !important;
min-height: 100vh !important;
}
/* Make the main content area responsive */
.rst-content {
max-width: none !important;
width: 100% !important;
}
/* Optional: Adjust table widths to prevent overflow */
.rst-content table.docutils {
width: 100% !important;
table-layout: auto !important;
}
/* Optional: Better code block width handling */
.rst-content .highlight {
width: 100% !important;
}
/* Content area positioning already handled above */
/* Optional: Improve readability with some margin on very wide screens */
@media (min-width: 1400px) {
.wy-nav-content {
max-width: none !important;
margin: 0 auto !important;
}
}
/* Resizable sidebar styles */
.wy-nav-side {
position: fixed !important;
top: 0 !important;
bottom: 0 !important;
left: 0 !important;
width: 300px;
min-width: 200px;
max-width: 600px;
display: flex;
flex-direction: column;
z-index: 200 !important;
}
/* Ensure sidebar header (logo, search) adapts to width */
.wy-side-nav-search {
width: 100% !important;
box-sizing: border-box !important;
padding: 0.809em 0.809em !important;
}
.wy-side-nav-search input[type="text"] {
width: 100% !important;
box-sizing: border-box !important;
}
/* Make logo/title area responsive */
.wy-side-nav-search > div.version {
width: 100% !important;
}
.wy-side-nav-search > a {
width: 100% !important;
display: block !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
/* Responsive adjustments for narrow sidebar */
@media (max-width: 300px) {
.wy-side-nav-search > a {
font-size: 0.9em !important;
}
.wy-side-nav-search input[type="text"] {
font-size: 0.8em !important;
}
}
/* Ensure search input doesn't overflow */
.wy-side-nav-search form {
width: 100% !important;
margin: 0 !important;
}
/* Make search icon responsive */
.wy-side-nav-search .wy-dropdown {
width: 100% !important;
}
/* Adjust search results dropdown width */
.wy-side-nav-search .wy-dropdown-menu {
width: 100% !important;
max-width: none !important;
left: 0 !important;
right: 0 !important;
}
/* Resize handle is created by JavaScript */
/* Make sure the sidebar content doesn't overflow */
.wy-side-scroll {
width: 100% !important;
flex: 1 !important;
overflow-y: auto !important;
overflow-x: hidden !important;
padding-right: 10px !important;
box-sizing: border-box !important;
scroll-behavior: auto !important; /* Prevent smooth scrolling on sidebar itself */
}
/* Ensure proper scroll behavior for main content area */
html {
scroll-behavior: smooth !important;
}
/* Ensure anchor links work properly in main content */
.wy-nav-content-wrap {
scroll-behavior: smooth !important;
}
/* Fix scroll to target for anchor links */
.rst-content {
scroll-behavior: smooth !important;
}
/* Fix anchor scroll offset to account for fixed header */
.rst-content .section {
scroll-margin-top: 60px;
}
/* Fix anchor scroll offset for headers */
.rst-content h1, .rst-content h2, .rst-content h3, .rst-content h4, .rst-content h5, .rst-content h6 {
scroll-margin-top: 60px;
}
/* Fix anchor scroll offset for specific scroll targets */
.rst-content .headerlink {
scroll-margin-top: 60px;
}
/* Fix sidebar navigation styling */
.wy-menu-vertical {
width: 100% !important;
}
.wy-menu-vertical li {
width: 100% !important;
}
.wy-menu-vertical a {
width: 100% !important;
word-wrap: break-word !important;
white-space: normal !important;
}
/* Content area margin is handled by JavaScript */
/* Custom drag handle (more visible) */
.resize-handle {
position: absolute;
top: 0;
right: 0;
width: 8px;
height: 100%;
background: #ccc;
cursor: col-resize;
z-index: 1001;
opacity: 0.3;
transition: opacity 0.2s ease;
}
.resize-handle:hover {
opacity: 0.8;
background: #999;
}
.resize-handle::before {
content: '';
position: absolute;
top: 50%;
left: 50%;
width: 2px;
height: 20px;
background: #666;
transform: translate(-50%, -50%);
border-radius: 1px;
}
.resize-handle:hover::before {
background: #333;
}
/* Ensure smooth resizing */
.wy-nav-side.resizing {
user-select: none;
pointer-events: none;
}
.wy-nav-side.resizing .wy-side-scroll {
overflow: hidden;
}
// Resizable sidebar functionality
document.addEventListener('DOMContentLoaded', function() {
const sidebar = document.querySelector('.wy-nav-side');
const content = document.querySelector('.wy-nav-content-wrap');
if (!sidebar || !content) return;
// Create resize handle
const resizeHandle = document.createElement('div');
resizeHandle.className = 'resize-handle';
sidebar.appendChild(resizeHandle);
let isResizing = false;
let startX = 0;
let startWidth = 0;
// Get initial width
const getInitialWidth = () => {
return 300; // Default width
};
// Save width to localStorage
const saveWidth = (width) => {
localStorage.setItem('sidebar-width', width);
};
// Load width from localStorage
const loadWidth = () => {
const savedWidth = localStorage.getItem('sidebar-width');
if (savedWidth) {
const width = parseInt(savedWidth, 10);
if (width >= 200 && width <= 600) {
return width;
}
}
return getInitialWidth();
};
// Apply width to sidebar and content
const applyWidth = (width) => {
// Update sidebar width
sidebar.style.width = width + 'px';
// Update content margin with !important to override any CSS
content.style.setProperty('margin-left', width + 'px', 'important');
// Also update any other content wrapper that might exist
const contentInner = document.querySelector('.wy-nav-content');
if (contentInner) {
contentInner.style.setProperty('margin-left', '0px', 'important');
}
// Force reflow and repaint
sidebar.offsetHeight;
content.offsetHeight;
// Trigger window resize event to notify other components
window.dispatchEvent(new Event('resize'));
};
// Initialize with saved width
const initialWidth = loadWidth();
applyWidth(initialWidth);
// Mouse down on resize handle
resizeHandle.addEventListener('mousedown', (e) => {
isResizing = true;
startX = e.clientX;
startWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
sidebar.classList.add('resizing');
document.body.style.cursor = 'col-resize';
document.body.style.userSelect = 'none';
// Add overlay to prevent iframe issues
const overlay = document.createElement('div');
overlay.style.cssText = `
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
z-index: 9999;
cursor: col-resize;
`;
overlay.id = 'resize-overlay';
document.body.appendChild(overlay);
e.preventDefault();
});
// Mouse move
document.addEventListener('mousemove', (e) => {
if (!isResizing) return;
const width = startWidth + e.clientX - startX;
const clampedWidth = Math.max(200, Math.min(600, width));
applyWidth(clampedWidth);
});
// Mouse up
document.addEventListener('mouseup', () => {
if (!isResizing) return;
isResizing = false;
sidebar.classList.remove('resizing');
document.body.style.cursor = '';
document.body.style.userSelect = '';
// Remove overlay
const overlay = document.getElementById('resize-overlay');
if (overlay) {
overlay.remove();
}
// Save the current width
const currentWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
saveWidth(currentWidth);
});
// Handle window resize - removed to prevent infinite loop
// The sidebar width is fixed and managed by drag functionality, no need to recalculate on window resize
// Double-click to reset to default width
resizeHandle.addEventListener('dblclick', () => {
const defaultWidth = 300;
applyWidth(defaultWidth);
saveWidth(defaultWidth);
});
});
// Fix navigation issues - Using MutationObserver for reliable initialization
document.addEventListener('DOMContentLoaded', function() {
let navigationFixed = false;
function setupNavigationFix() {
if (navigationFixed) return;
// Find all links in the sidebar
const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
// Only proceed if we have sidebar links
if (sidebarLinks.length === 0) return;
console.log('Setting up navigation fix...');
sidebarLinks.forEach(function(link) {
const href = link.getAttribute('href');
// Clone the link to remove all existing event listeners
const newLink = link.cloneNode(true);
// Add our own click handler
newLink.addEventListener('click', function(e) {
console.log('Link clicked:', href);
// If it's an anchor link within the same page
if (href && href.startsWith('#') && href !== '#') {
e.preventDefault();
e.stopPropagation();
const targetId = href.substring(1);
const targetElement = document.getElementById(targetId);
if (targetElement) {
// Calculate offset for fixed header
const headerHeight = 60;
const elementPosition = targetElement.getBoundingClientRect().top;
const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
window.scrollTo({
top: offsetPosition,
behavior: 'smooth'
});
// Update URL hash
if (history.pushState) {
history.pushState(null, null, '#' + targetId);
} else {
location.hash = '#' + targetId;
}
}
}
// For external links, navigate normally
else if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
console.log('Navigating to external link:', href);
window.location.href = href;
}
});
// Replace the old link with the new one
link.parentNode.replaceChild(newLink, link);
});
navigationFixed = true;
// Handle initial page load with hash
if (window.location.hash) {
// Use requestAnimationFrame for better timing
requestAnimationFrame(() => {
const targetId = window.location.hash.substring(1);
const targetElement = document.getElementById(targetId);
if (targetElement) {
const headerHeight = 60;
const elementPosition = targetElement.getBoundingClientRect().top;
const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
window.scrollTo({
top: offsetPosition,
behavior: 'smooth'
});
}
});
}
}
// Try to set up navigation fix immediately
setupNavigationFix();
// If it didn't work, use MutationObserver to watch for when sidebar links are added
if (!navigationFixed) {
const observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.type === 'childList' && mutation.addedNodes.length > 0) {
// Check if sidebar links were added
const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
if (sidebarLinks.length > 0) {
setupNavigationFix();
if (navigationFixed) {
observer.disconnect();
}
}
}
});
});
// Start observing the document for changes
observer.observe(document.body, {
childList: true,
subtree: true
});
// Fallback timeout in case MutationObserver doesn't work
setTimeout(function() {
if (!navigationFixed) {
setupNavigationFix();
}
observer.disconnect();
}, 5000);
}
});
document.addEventListener("DOMContentLoaded", function () {
var script = document.createElement("script");
script.type = "module";
script.id = "runllm-widget-script";
script.src = "https://widget.runllm.com";
script.setAttribute("version", "stable");
script.setAttribute("crossorigin", "true");
script.setAttribute("runllm-keyboard-shortcut", "Mod+j");
script.setAttribute("runllm-name", "verl Chatbot");
script.setAttribute("runllm-position", "TOP_RIGHT");
script.setAttribute("runllm-assistant-id", "679");
script.async = true;
document.head.appendChild(script);
});
Agent Loop
==========
Last updated: 07/17/2025.
.. versionadded:: 0.4.2
[status: alpha]
.. warning::
Agent Loop is ready for use, but the API may change in future releases.
Agent Loop is designed as a general interface for multi-turn rollout and agentic reinforcement learning.
**Design goal**:
- Pluggable user-defined agent loops
- Provide a standard request/generate API across different inference frameworks
- Provide request-level load balancing between multiple inference servers
**Non-goal**:
- How tools are defined and how to call tools
At a high level, the agent loop is given a prompt and runs a user-defined loop (call the LLM generate API, call tools, ...),
then returns the final output. A reward is then computed on the final output, which is used as a trajectory for RL training.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_overview.svg?raw=true
API Design
----------
The ``AgentLoopBase`` class is the abstraction of the agent loop, and the ``run`` method is the only interface users need to implement.
Given prompt messages in the format ``[{"role": "user", "content": "..."}]`` and additional sampling params, the run method
can do whatever the user wants, such as
- call LLM generate api
- call tools: web search, database query, code sandbox, ...
- environment interaction
- reflection
- ...
.. code:: python
class AgentLoopBase(ABC):
@abstractmethod
async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
"""Run agent loop to interact with LLM server and environment.
Args:
messages (List[Dict[str, Any]]): Input messages.
sampling_params (Dict[str, Any]): LLM sampling params.
Returns:
AgentLoopOutput: Agent loop output.
"""
raise NotImplementedError
After running the user-defined loop, the run method should return an ``AgentLoopOutput``, which includes the prompt token ids,
response token ids, and response mask.
.. code:: python
class AgentLoopOutput(BaseModel):
"""Agent loop output."""
prompt_ids: list[int]
"""Prompt token ids."""
response_ids: list[int]
"""Response token ids including LLM generated token, tool response token."""
response_mask: list[int]
"""Response mask, 1 for LLM generated token, 0 for tool response token."""
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_output.svg?raw=true
.. note:: AgentLoopOutput only outputs one trajectory for a given prompt; multi-trajectory output is still under discussion.
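For concreteness, below is a minimal sketch of a single-turn agent loop. It assumes the loop instance holds a ``tokenizer`` and a ``server_manager`` (an ``AsyncLLMServerManager``); these attribute names and the request-id handling are illustrative assumptions rather than the exact verl API.
.. code:: python

    from typing import Any
    from uuid import uuid4


    class SingleTurnAgentLoop(AgentLoopBase):
        """Minimal sketch: one LLM call, no tool calls (names are illustrative)."""

        async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
            # Tokenize the chat prompt once; later turns would work on token ids only.
            prompt_ids = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=True
            )
            # Ask the server manager for a completion; reusing the request id
            # would keep a sticky session if more turns were issued.
            response_ids = await self.server_manager.generate(
                request_id=uuid4().hex,
                prompt_ids=prompt_ids,
                sampling_params=sampling_params,
            )
            return AgentLoopOutput(
                prompt_ids=prompt_ids,
                response_ids=response_ids,
                response_mask=[1] * len(response_ids),  # every token is LLM-generated
            )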
Architecture Design
-------------------
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_architecture.png?raw=true
A single PPO step contains two phases: rollout and train. In the rollout phase:
1. PPOTrainer samples a batch from the dataset and calls ``AgentLoopManager.generate_sequences``.
2. AgentLoopManager calls ``wake_up`` on all async LLM server instances, which syncs weights between the inference engine (vLLM/SGLang) and the training engine (FSDP/Megatron-LM).
3. AgentLoopManager splits the batch into chunks and sends each chunk to an ``AgentLoopWorker``.
4. AgentLoopWorker receives a chunk and, for each prompt, spawns a user-defined ``AgentLoopBase`` instance, runs the ``run`` coroutine to completion, and gets an ``AgentLoopOutput``.
.. tip::
AgentLoopWorker schedules multiple coroutines concurrently. If the number of AgentLoopWorkers equals batch_size, then each worker is responsible for one prompt.
Within the agent loop, when the user needs the LLM to generate a response:
5. Call ``AsyncLLMServerManager.generate`` with prompt_ids.
6. AsyncLLMServerManager selects the server instance with the fewest requests for the first turn and sends the request to it (in following turns, the request is sent to the same server instance).
7. AsyncLLMServer receives the request, issues ipc/rpc to the model_runner, and generates the response (there are slight differences between vLLM and SGLang; see below).
When all prompts in all AgentLoopWorkers finish, AgentLoopManager gathers the results and returns them to PPOTrainer.
8. AgentLoopManager calls ``sleep`` on all server instances, which frees the kv cache and offloads weights to CPU memory.
AsyncLLMServer
~~~~~~~~~~~~~~
AsyncLLMServer is the abstraction of an LLM server with two types of generation API:
- `OpenAI chat completion <https://platform.openai.com/docs/api-reference/chat>`_: generate a response for the given chat conversation.
- Token-in token-out: generate response ids for the given token ids.
We officially support vLLM and SGLang AsyncLLMServers; both implement the two APIs and are well tested.
Other inference engines should be easy to plug in by implementing the ``AsyncServerBase`` class.
.. code:: python
class AsyncServerBase(ABC):
@abstractmethod
async def chat_completion(self, raw_request: Request) -> JSONResponse:
"""OpenAI chat completion API.
Args:
raw_request (Request): raw json request
Returns:
JSONResponse: json response
API reference: https://platform.openai.com/docs/api-reference/chat/create
"""
raise NotImplementedError
@abstractmethod
async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
"""Generate response ids given prompt ids.
Args:
prompt_ids (List[int]): prompt ids
sampling_params (Dict[str, Any]): sampling params
request_id (str): request id
Returns:
List[int]: response ids
"""
raise NotImplementedError
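A hypothetical plug-in might look like the sketch below. It assumes FastAPI's ``Request``/``JSONResponse`` types and wraps a made-up ``engine`` object whose ``chat`` and ``async_generate`` coroutines are assumptions for illustration, not part of verl or any real engine.
.. code:: python

    from typing import Any

    from fastapi import Request
    from fastapi.responses import JSONResponse


    class MyEngineAsyncServer(AsyncServerBase):
        """Sketch of a plug-in server wrapping a hypothetical inference engine."""

        def __init__(self, engine):
            self.engine = engine  # hypothetical engine handle

        async def chat_completion(self, raw_request: Request) -> JSONResponse:
            body = await raw_request.json()
            # `engine.chat` is an assumed coroutine returning the assistant text.
            text = await self.engine.chat(body["messages"])
            return JSONResponse(
                {"choices": [{"message": {"role": "assistant", "content": text}}]}
            )

        async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
            # Token-in token-out path used by the agent loop.
            return await self.engine.async_generate(prompt_ids, sampling_params, request_id)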
Chat completion vs Token in token out
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
The following conclusion is based on our recent experience and is still open to investigation and discussion.
Almost all agent frameworks (LangGraph, CrewAI, LlamaIndex, etc.) call the LLM with the OpenAI chat completion API and
keep the chat history as messages, so users may expect that we should use the chat completion API in multi-turn rollout.
But based on our recent experience with single-turn training on DAPO and multi-turn training on `retool <https://github.com/volcengine/verl/tree/main/recipe/retool>`_,
we found that the token_ids from applying the chat template to the final messages may not equal the token_ids obtained by concatenating the prompt_ids and response_ids from each turn.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/multi_turn.png?raw=true
**Where does this inconsistency happen?**
First, the tool parser may alter the content. For example:
.. code:: json
{"role": "assistant", "content": "Let me call a <tool_call>...</tool_call> and get the result"}
After tool_calls extraction, the message looks like this:
.. code:: json
{"role": "assistant", "content": "Let me call a and get the result", "tool_calls": [{"name": "foo", "arguments": "{}"}]}
Encoding the extracted message back does not reproduce the original LLM-generated response_ids.
Second, the ``decode-encode`` round trip may also lead to inconsistency: `Agent-R1 issue#30 <https://github.com/0russwest0/Agent-R1/issues/30#issuecomment-2826155367>`_.
**What is the impact of this inconsistency?**
This inconsistency is not a big problem for serving/agent systems, but it is critical for RL training:
it causes the trajectory to deviate from the policy model distribution. We have observed that applying apply_chat_template
to the final chat history messages can keep PPO training from converging even in the single-turn setting.
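A quick way to check this in your own rollout is to compare the two tokenizations directly. The function below is a diagnostic sketch under stated assumptions (a Hugging Face-style ``tokenizer`` and the per-turn ids recorded during rollout); it is not part of verl.
.. code:: python

    from typing import Any


    def rollout_ids_match_reencoding(
        tokenizer: Any,
        prompt_ids: list[int],
        per_turn_response_ids: list[list[int]],
        final_messages: list[dict[str, Any]],
    ) -> bool:
        """Return True if re-encoding the final chat history reproduces the rollout ids."""
        # Tokens actually produced during rollout: the prompt ids plus each turn's
        # ids (LLM-generated tokens and tool-response tokens), concatenated in order.
        rollout_ids = list(prompt_ids)
        for ids in per_turn_response_ids:
            rollout_ids += ids

        # Tokens obtained by re-encoding the final chat history messages.
        reencoded_ids = tokenizer.apply_chat_template(
            final_messages, add_generation_prompt=False, tokenize=True
        )
        return rollout_ids == reencoded_ids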
vLLM
^^^^
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_vllm.png?raw=true
For vLLM, the Async LLM Engine runs in the same process as the server, and the ModelRunner runs in the same process as the FSDP/Megatron-LM workers.
The Async LLM Engine communicates with the ModelRunner through ZeroMQ. When the server receives a request, it directly calls the engine to generate response_ids.
SGLang
^^^^^^
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_sglang.png?raw=true
For SGLang, the Async LLM Engine runs in the same process as FSDP/Megatron-LM worker-0, and it spawns multiple subprocesses as ModelRunners.
The Async LLM Engine also communicates with the ModelRunners through ZeroMQ. When the server receives a request, it remote-calls worker-0 and gets the response_ids.
AsyncLLMServerManager
~~~~~~~~~~~~~~~~~~~~~
AsyncLLMServerManager serves as a proxy to multiple AsyncLLMServer instances and provides:
- load balancing: select the server instance with the fewest requests for the first turn and send the request to it.
- sticky sessions: bind the request_id to a server instance, so that the same request_id is sent to the same server instance in following turns.
AsyncLLMServerManager is passed to ``AgentLoopBase.__init__``; whenever users want to interact with the LLM in an agent loop,
they can call ``AsyncLLMServerManager.generate`` to generate response_ids.
.. code:: python
class AsyncLLMServerManager:
async def generate(
self,
request_id,
*,
prompt_ids: list[int],
sampling_params: dict[str, Any],
) -> list[int]:
"""Generate tokens from prompt ids.
Args:
request_id (str): request id for sticky session.
prompt_ids (List[int]): List of prompt token ids.
sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
Returns:
List[int]: List of generated token ids.
"""
...
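The sketch below shows the sticky-session pattern across two turns and how a response_mask could be assembled; the ``run_tool`` helper and the tokenizer handling are illustrative assumptions, not verl APIs.
.. code:: python

    from uuid import uuid4


    async def two_turn_example(server_manager, tokenizer, messages, sampling_params, run_tool):
        """Illustrative two-turn interaction; `run_tool` returns tool-response token ids."""
        request_id = uuid4().hex  # reuse the same id so both turns hit the same server
        prompt_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)

        # Turn 1: the LLM proposes a tool call.
        first_ids = await server_manager.generate(
            request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
        )
        tool_ids = run_tool(first_ids)  # assumed helper: parse the call, run it, tokenize the result

        # Turn 2: continue from the concatenated token ids (token-in token-out).
        second_ids = await server_manager.generate(
            request_id,
            prompt_ids=prompt_ids + first_ids + tool_ids,
            sampling_params=sampling_params,
        )
        response_ids = first_ids + tool_ids + second_ids
        response_mask = [1] * len(first_ids) + [0] * len(tool_ids) + [1] * len(second_ids)
        return response_ids, response_mask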
Next
----
- :doc:`Agentic RL Training<../start/agentic_rl>`: Quick start agentic RL training with gsm8k dataset.
- `LangGraph MathExpression <https://github.com/volcengine/verl/tree/main/recipe/langgraph_agent/example>`_: Demonstrates how to use LangGraph to build an agent loop.
- `Retool <https://github.com/volcengine/verl/tree/main/recipe/retool>`_: End-to-end retool paper reproduction using a tool agent.