Commit 7f6cc211 authored by jerrrrry

Initial commit
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.7.0+cu126 + vllm-0.9.1
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.9.1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Fix qwen vl
RUN pip3 install --no-cache-dir --no-deps trl
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0
# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7
# https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; errors occur if these are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1
# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7
# https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.52.3" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; errors occur if these are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.6
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0
sglang==0.4.8
vllm==0.8.5.post1
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.3
megatron.core==core_v0.12.2
# Preview
transformer_engine==2.5
megatron.core==core_r0.13.0
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4`: We offer a base image with DeepEP built in, for vLLM
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with DeepEP built in, for SGLang
- App image:
- `verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2`
- `verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2`
- `verlai/verl:app-verl0.5-sglang0.4.9.post2-mcore0.12.2-te2.2`
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --upgrade pip setuptools packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --upgrade pip setuptools packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1
# Install flash-attn-2.8.0.post2 (cxx11abi=True)
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
FILE="flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.53" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install DeepEP
## the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; errors occur if these are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.6
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0
sglang==0.4.8
vllm==0.8.5.post1
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.3
megatron.core==core_v0.12.2
# Preview
transformer_engine==2.5
megatron.core==core_r0.13.0
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0`: We offer a base image with DeepEP built in
- App image:
- `verlai/verl:app-verl0.5-sglang0.4.9-mcore0.12.2`
- `verlai/verl:app-verl0.5-sglang0.4.9-mcore0.13.0-preview`
- vLLM does not yet support the latest version
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6
# Start from the NVIDIA official image (cuda-12.8 + python-3.12)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:25.02-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128
# Install flash-attn-2.8.0.post2 (cxx11abi=True)
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp312-cp312-linux_x86_64.whl" && \
FILE="flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp312-cp312-linux_x86_64.whl" && \
wget -nv "${URL}" && \
pip install --no-cache-dir "${FILE}"
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0
RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pre-commit ruff
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# verl image with verl v0.5
## Important packages version
```txt
cuda==12.8
cudnn==9.8.0
torch==2.7.1
flash_attn==2.8.0
sglang==0.4.8
transformer_engine==2.5
megatron.core==core_r0.13.0
nvidia-cudnn-cu12==9.8.0.87
```
## Target
- Base image:
- `verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0`: We offer a base image with FlashInfer 0.2.6.post1 built in
- App image:
- `verlai/verl:app-verl0.5-preview-sglang0.4.8-mcore0.13.0-preview`
- vLLM does not yet support the latest version
## !!!Notice!!!
- pyext lacks maintenance and does not work with Python 3.12; consider using a replacement and deprecating this package.
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = verl
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# verl documentations
## Build the docs
```bash
# If you want to view the auto-generated API docstrings, please make sure verl is available on the python path. For instance, install verl via:
# pip install -e .[test]   (run from the verl repo root)
# Install dependencies needed for building docs.
pip install -r requirements-docs.txt
# Build the docs.
make clean
make html
```
## Open the docs with your browser
```bash
python -m http.server -d _build/html/
```
Launch your browser and navigate to http://localhost:8000 to view the documentation. Alternatively, you can open the file `_build/html/index.html` directly in your browser.
# Upgrading to vllm >= 0.7
Note: verl+vllm 0.8.3 is now stable. Please see ``docs/README_vllm0.8.md`` for the upgrade guide.
## Installation
Note: At the time of writing, verl+vllm 0.7.x supports **FSDP** for training and **vLLM** for rollout.
```
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.7.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
Note that if you are installing a lower version of vLLM (0.7.0, 0.7.1, or 0.7.2), you need to manually apply some tiny patches to vllm (under /path/to/site-packages/vllm after installation) after the above steps:
- vllm/distributed/parallel_state.py: Remove the assertion below:
```
if (world_size
!= tensor_model_parallel_size * pipeline_model_parallel_size):
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
```
- vllm/executor/uniproc_executor.py: change `local_rank = rank` to `local_rank = int(os.environ["LOCAL_RANK"])`
- vllm/model_executor/model_loader/weight_utils.py: remove the `torch.cuda.empty_cache()` in `pt_weights_iterator`
## Features
### Use cuda graph
After installation, examples using FSDP as the training backend can be run. By default, `enforce_eager` is set to True, which disables the cuda graph. To enable cuda graphs and the sleep mode of vLLM>=0.7, add the following lines to the bash script:
```
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
```
For a typical job like examples/ppo_trainer/run_qwen2-7b_seq_balance.sh, the rollout generation time is 85 seconds with vLLM 0.7.0. Enabling the cuda graph reduces the generation time further, to 62 seconds.
**Note:** Currently, if `n` is greater than 1 in `SamplingParams` with vLLM>=0.7, there is a potential performance issue affecting the stability of rollout generation time (some iterations see generation-time bursts) when using vLLM's V0 engine.
### Use vLLM V1 Engine
Using the vLLM V1 engine can avoid instability issues and achieve additional performance improvements. To use the V1 engine, you can first uninstall the previously installed vLLM and then follow the steps below to install the newer version.
```
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout 2275784
sed -i "903a\ data_parallel_size = world_size // pipeline_model_parallel_size // tensor_model_parallel_size" ./vllm/distributed/parallel_state.py
VLLM_USE_PRECOMPILED=1 pip install --editable .
```
Then you can enable the V1 engine by setting `export VLLM_USE_V1=1`. In some benchmark tests, the V1 engine demonstrates a 1.5x speed improvement over the vLLM V0 engine.
Stable support for the vLLM V1 engine is available on verl main.
# Upgrading to vLLM >= 0.8
Last updated: 05/04/2025.
## Installation
Note: This version of verl+vLLM 0.8+ supports **FSDP** for training and **vLLM** for rollout.
```bash
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.8.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
We have a pre-built docker image for verl+vLLM 0.8.3. You can pull it directly with the following command:
```bash
docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
```
## Features
vLLM 0.8+ supports cuda graph and V1 engine by default in verl. To enable these features, remember to add the following lines to the bash script:
```bash
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
```
and also **remove** the environment variable if it was previously set.
## Notes
When you directly upgrade to vllm>=0.8, some dependency packages may undergo version changes. If you encounter the following problem:
```bash
in <module> from torch.multiprocessing.reductions import ForkingPickler
ImportError: cannot import name 'ForkingPickler' from 'torch.multiprocessing.reductions' (/opt/conda/lib/python3.11/site-packages/torch/multiprocessing/reductions.py)
```
You need to upgrade `tensordict` to version 0.6.2 using the command `pip install tensordict==0.6.2`.
/* Make the documentation use full screen width */
.wy-nav-content {
max-width: none !important;
width: 100% !important;
padding: 1.618em 3.236em !important;
}
/* Adjust the content wrapper - will be set by JavaScript */
.wy-nav-content-wrap {
margin-left: 300px;
transition: margin-left 0.2s ease;
width: auto !important;
position: relative !important;
background: white !important;
min-height: 100vh !important;
}
/* Make the main content area responsive */
.rst-content {
max-width: none !important;
width: 100% !important;
}
/* Optional: Adjust table widths to prevent overflow */
.rst-content table.docutils {
width: 100% !important;
table-layout: auto !important;
}
/* Optional: Better code block width handling */
.rst-content .highlight {
width: 100% !important;
}
/* Content area positioning already handled above */
/* Optional: Improve readability with some margin on very wide screens */
@media (min-width: 1400px) {
.wy-nav-content {
max-width: none !important;
margin: 0 auto !important;
}
}
/* Resizable sidebar styles */
.wy-nav-side {
position: fixed !important;
top: 0 !important;
bottom: 0 !important;
left: 0 !important;
width: 300px;
min-width: 200px;
max-width: 600px;
display: flex;
flex-direction: column;
z-index: 200 !important;
}
/* Ensure sidebar header (logo, search) adapts to width */
.wy-side-nav-search {
width: 100% !important;
box-sizing: border-box !important;
padding: 0.809em 0.809em !important;
}
.wy-side-nav-search input[type="text"] {
width: 100% !important;
box-sizing: border-box !important;
}
/* Make logo/title area responsive */
.wy-side-nav-search > div.version {
width: 100% !important;
}
.wy-side-nav-search > a {
width: 100% !important;
display: block !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
/* Responsive adjustments for narrow sidebar */
@media (max-width: 300px) {
.wy-side-nav-search > a {
font-size: 0.9em !important;
}
.wy-side-nav-search input[type="text"] {
font-size: 0.8em !important;
}
}
/* Ensure search input doesn't overflow */
.wy-side-nav-search form {
width: 100% !important;
margin: 0 !important;
}
/* Make search icon responsive */
.wy-side-nav-search .wy-dropdown {
width: 100% !important;
}
/* Adjust search results dropdown width */
.wy-side-nav-search .wy-dropdown-menu {
width: 100% !important;
max-width: none !important;
left: 0 !important;
right: 0 !important;
}
/* Resize handle is created by JavaScript */
/* Make sure the sidebar content doesn't overflow */
.wy-side-scroll {
width: 100% !important;
flex: 1 !important;
overflow-y: auto !important;
overflow-x: hidden !important;
padding-right: 10px !important;
box-sizing: border-box !important;
scroll-behavior: auto !important; /* Prevent smooth scrolling on sidebar itself */
}
/* Ensure proper scroll behavior for main content area */
html {
scroll-behavior: smooth !important;
}
/* Ensure anchor links work properly in main content */
.wy-nav-content-wrap {
scroll-behavior: smooth !important;
}
/* Fix scroll to target for anchor links */
.rst-content {
scroll-behavior: smooth !important;
}
/* Fix anchor scroll offset to account for fixed header */
.rst-content .section {
scroll-margin-top: 60px;
}
/* Fix anchor scroll offset for headers */
.rst-content h1, .rst-content h2, .rst-content h3, .rst-content h4, .rst-content h5, .rst-content h6 {
scroll-margin-top: 60px;
}
/* Fix anchor scroll offset for specific scroll targets */
.rst-content .headerlink {
scroll-margin-top: 60px;
}
/* Fix sidebar navigation styling */
.wy-menu-vertical {
width: 100% !important;
}
.wy-menu-vertical li {
width: 100% !important;
}
.wy-menu-vertical a {
width: 100% !important;
word-wrap: break-word !important;
white-space: normal !important;
}
/* Content area margin is handled by JavaScript */
/* Custom drag handle (more visible) */
.resize-handle {
position: absolute;
top: 0;
right: 0;
width: 8px;
height: 100%;
background: #ccc;
cursor: col-resize;
z-index: 1001;
opacity: 0.3;
transition: opacity 0.2s ease;
}
.resize-handle:hover {
opacity: 0.8;
background: #999;
}
.resize-handle::before {
content: '';
position: absolute;
top: 50%;
left: 50%;
width: 2px;
height: 20px;
background: #666;
transform: translate(-50%, -50%);
border-radius: 1px;
}
.resize-handle:hover::before {
background: #333;
}
/* Ensure smooth resizing */
.wy-nav-side.resizing {
user-select: none;
pointer-events: none;
}
.wy-nav-side.resizing .wy-side-scroll {
overflow: hidden;
}
// Resizable sidebar functionality
document.addEventListener('DOMContentLoaded', function() {
const sidebar = document.querySelector('.wy-nav-side');
const content = document.querySelector('.wy-nav-content-wrap');
if (!sidebar || !content) return;
// Create resize handle
const resizeHandle = document.createElement('div');
resizeHandle.className = 'resize-handle';
sidebar.appendChild(resizeHandle);
let isResizing = false;
let startX = 0;
let startWidth = 0;
// Get initial width
const getInitialWidth = () => {
return 300; // Default width
};
// Save width to localStorage
const saveWidth = (width) => {
localStorage.setItem('sidebar-width', width);
};
// Load width from localStorage
const loadWidth = () => {
const savedWidth = localStorage.getItem('sidebar-width');
if (savedWidth) {
const width = parseInt(savedWidth, 10);
if (width >= 200 && width <= 600) {
return width;
}
}
return getInitialWidth();
};
// Apply width to sidebar and content
const applyWidth = (width) => {
// Update sidebar width
sidebar.style.width = width + 'px';
// Update content margin with !important to override any CSS
content.style.setProperty('margin-left', width + 'px', 'important');
// Also update any other content wrapper that might exist
const contentInner = document.querySelector('.wy-nav-content');
if (contentInner) {
contentInner.style.setProperty('margin-left', '0px', 'important');
}
// Force reflow and repaint
sidebar.offsetHeight;
content.offsetHeight;
// Trigger window resize event to notify other components
window.dispatchEvent(new Event('resize'));
};
// Initialize with saved width
const initialWidth = loadWidth();
applyWidth(initialWidth);
// Mouse down on resize handle
resizeHandle.addEventListener('mousedown', (e) => {
isResizing = true;
startX = e.clientX;
startWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
sidebar.classList.add('resizing');
document.body.style.cursor = 'col-resize';
document.body.style.userSelect = 'none';
// Add overlay to prevent iframe issues
const overlay = document.createElement('div');
overlay.style.cssText = `
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
z-index: 9999;
cursor: col-resize;
`;
overlay.id = 'resize-overlay';
document.body.appendChild(overlay);
e.preventDefault();
});
// Mouse move
document.addEventListener('mousemove', (e) => {
if (!isResizing) return;
const width = startWidth + e.clientX - startX;
const clampedWidth = Math.max(200, Math.min(600, width));
applyWidth(clampedWidth);
});
// Mouse up
document.addEventListener('mouseup', () => {
if (!isResizing) return;
isResizing = false;
sidebar.classList.remove('resizing');
document.body.style.cursor = '';
document.body.style.userSelect = '';
// Remove overlay
const overlay = document.getElementById('resize-overlay');
if (overlay) {
overlay.remove();
}
// Save the current width
const currentWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
saveWidth(currentWidth);
});
// Handle window resize - removed to prevent infinite loop
// The sidebar width is fixed and managed by drag functionality, no need to recalculate on window resize
// Double-click to reset to default width
resizeHandle.addEventListener('dblclick', () => {
const defaultWidth = 300;
applyWidth(defaultWidth);
saveWidth(defaultWidth);
});
});
// Fix navigation issues - Using MutationObserver for reliable initialization
document.addEventListener('DOMContentLoaded', function() {
let navigationFixed = false;
function setupNavigationFix() {
if (navigationFixed) return;
// Find all links in the sidebar
const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
// Only proceed if we have sidebar links
if (sidebarLinks.length === 0) return;
console.log('Setting up navigation fix...');
sidebarLinks.forEach(function(link) {
const href = link.getAttribute('href');
// Clone the link to remove all existing event listeners
const newLink = link.cloneNode(true);
// Add our own click handler
newLink.addEventListener('click', function(e) {
console.log('Link clicked:', href);
// If it's an anchor link within the same page
if (href && href.startsWith('#') && href !== '#') {
e.preventDefault();
e.stopPropagation();
const targetId = href.substring(1);
const targetElement = document.getElementById(targetId);
if (targetElement) {
// Calculate offset for fixed header
const headerHeight = 60;
const elementPosition = targetElement.getBoundingClientRect().top;
const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
window.scrollTo({
top: offsetPosition,
behavior: 'smooth'
});
// Update URL hash
if (history.pushState) {
history.pushState(null, null, '#' + targetId);
} else {
location.hash = '#' + targetId;
}
}
}
// For external links, navigate normally
else if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
console.log('Navigating to external link:', href);
window.location.href = href;
}
});
// Replace the old link with the new one
link.parentNode.replaceChild(newLink, link);
});
navigationFixed = true;
// Handle initial page load with hash
if (window.location.hash) {
// Use requestAnimationFrame for better timing
requestAnimationFrame(() => {
const targetId = window.location.hash.substring(1);
const targetElement = document.getElementById(targetId);
if (targetElement) {
const headerHeight = 60;
const elementPosition = targetElement.getBoundingClientRect().top;
const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
window.scrollTo({
top: offsetPosition,
behavior: 'smooth'
});
}
});
}
}
// Try to set up navigation fix immediately
setupNavigationFix();
// If it didn't work, use MutationObserver to watch for when sidebar links are added
if (!navigationFixed) {
const observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.type === 'childList' && mutation.addedNodes.length > 0) {
// Check if sidebar links were added
const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
if (sidebarLinks.length > 0) {
setupNavigationFix();
if (navigationFixed) {
observer.disconnect();
}
}
}
});
});
// Start observing the document for changes
observer.observe(document.body, {
childList: true,
subtree: true
});
// Fallback timeout in case MutationObserver doesn't work
setTimeout(function() {
if (!navigationFixed) {
setupNavigationFix();
}
observer.disconnect();
}, 5000);
}
});
document.addEventListener("DOMContentLoaded", function () {
var script = document.createElement("script");
script.type = "module";
script.id = "runllm-widget-script";
script.src = "https://widget.runllm.com";
script.setAttribute("version", "stable");
script.setAttribute("crossorigin", "true");
script.setAttribute("runllm-keyboard-shortcut", "Mod+j");
script.setAttribute("runllm-name", "verl Chatbot");
script.setAttribute("runllm-position", "TOP_RIGHT");
script.setAttribute("runllm-assistant-id", "679");
script.async = true;
document.head.appendChild(script);
});
Agent Loop
==========
Last updated: 07/17/2025.
.. versionadded:: 0.4.2
[status: alpha]
.. warning::
Agent Loop is ready for use, but the API may change in future releases.
Agent Loop is designed as a general interface for multi-turn rollout and agentic reinforcement learning.
**Design goal**:
- Pluggable user-defined agent loops
- Provide a standard request/generate API across different inference frameworks
- Provide request-level load balancing between multiple inference servers
**Non-goal**:
- How tools are defined and how to call tools
At a high level, the agent loop is given a prompt and runs a user-defined loop (call the LLM generate API, call tools, ...),
then returns the final output. A reward is then computed on the final output, which is used as a trajectory for RL training.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_overview.svg?raw=true
API Design
----------
The ``AgentLoopBase`` class is the abstraction of the agent loop, and the ``run`` method is the only interface users need to implement.
Given prompt messages in the format ``[{"role": "user", "content": "..."}]`` and additional sampling params, the run method
can do whatever the user wants, such as
- call LLM generate api
- call tools: web search, database query, code sandbox, ...
- environment interaction
- reflection
- ...
.. code:: python
class AgentLoopBase(ABC):
@abstractmethod
async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
"""Run agent loop to interact with LLM server and environment.
Args:
messages (List[Dict[str, Any]]): Input messages.
sampling_params (Dict[str, Any]): LLM sampling params.
Returns:
AgentLoopOutput: Agent loop output.
"""
raise NotImplementedError
After running the user-defined loop, the run method should return an ``AgentLoopOutput``, which includes the prompt token ids,
response token ids, and response mask.
.. code:: python
class AgentLoopOutput(BaseModel):
"""Agent loop output."""
prompt_ids: list[int]
"""Prompt token ids."""
response_ids: list[int]
"""Response token ids including LLM generated token, tool response token."""
response_mask: list[int]
"""Response mask, 1 for LLM generated token, 0 for tool response token."""
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_output.svg?raw=true
.. note:: AgentLoopOutput only outputs one trajectory for a given prompt; multi-trajectory output is still under discussion.
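For concreteness, below is a minimal sketch of a single-turn agent loop. It assumes the loop instance holds a ``tokenizer`` and a ``server_manager`` (an ``AsyncLLMServerManager``); these attribute names and the request-id handling are illustrative assumptions rather than the exact verl API.
.. code:: python

    from typing import Any
    from uuid import uuid4


    class SingleTurnAgentLoop(AgentLoopBase):
        """Minimal sketch: one LLM call, no tool calls (names are illustrative)."""

        async def run(self, messages: list[dict[str, Any]], sampling_params: dict[str, Any]) -> AgentLoopOutput:
            # Tokenize the chat prompt once; later turns would work on token ids only.
            prompt_ids = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=True
            )
            # Ask the server manager for a completion; reusing the request id
            # would keep a sticky session if more turns were issued.
            response_ids = await self.server_manager.generate(
                request_id=uuid4().hex,
                prompt_ids=prompt_ids,
                sampling_params=sampling_params,
            )
            return AgentLoopOutput(
                prompt_ids=prompt_ids,
                response_ids=response_ids,
                response_mask=[1] * len(response_ids),  # every token is LLM-generated
            )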
Architecture Design
-------------------
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_architecture.png?raw=true
A single PPO step contains two phases: rollout and train. In the rollout phase:
1. PPOTrainer samples a batch from the dataset and calls ``AgentLoopManager.generate_sequences``.
2. AgentLoopManager calls ``wake_up`` on all async LLM server instances, which syncs weights between the inference engine (vLLM/SGLang) and the training engine (FSDP/Megatron-LM).
3. AgentLoopManager splits the batch into chunks and sends each chunk to an ``AgentLoopWorker``.
4. AgentLoopWorker receives a chunk and, for each prompt, spawns a user-defined ``AgentLoopBase`` instance, runs the ``run`` coroutine to completion, and gets an ``AgentLoopOutput``.
.. tip::
AgentLoopWorker schedules multiple coroutines concurrently. If the number of AgentLoopWorkers equals batch_size, then each worker is responsible for one prompt.
Within the agent loop, when the user needs the LLM to generate a response:
5. Call ``AsyncLLMServerManager.generate`` with prompt_ids.
6. AsyncLLMServerManager selects the server instance with the fewest requests for the first turn and sends the request to it (in following turns, the request is sent to the same server instance).
7. AsyncLLMServer receives the request, issues ipc/rpc to the model_runner, and generates the response (there are slight differences between vLLM and SGLang; see below).
When all prompts in all AgentLoopWorkers finish, AgentLoopManager gathers the results and returns them to PPOTrainer.
8. AgentLoopManager calls ``sleep`` on all server instances, which frees the kv cache and offloads weights to CPU memory.
AsyncLLMServer
~~~~~~~~~~~~~~
AsyncLLMServer is the abstraction of an LLM server with two types of generation API:
- `OpenAI chat completion <https://platform.openai.com/docs/api-reference/chat>`_: generate a response for the given chat conversation.
- Token-in token-out: generate response ids for the given token ids.
We officially support vLLM and SGLang AsyncLLMServers; both implement the two APIs and are well tested.
Other inference engines should be easy to plug in by implementing the ``AsyncServerBase`` class.
.. code:: python
class AsyncServerBase(ABC):
@abstractmethod
async def chat_completion(self, raw_request: Request) -> JSONResponse:
"""OpenAI chat completion API.
Args:
raw_request (Request): raw json request
Returns:
JSONResponse: json response
API reference: https://platform.openai.com/docs/api-reference/chat/create
"""
raise NotImplementedError
@abstractmethod
async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
"""Generate response ids given prompt ids.
Args:
prompt_ids (List[int]): prompt ids
sampling_params (Dict[str, Any]): sampling params
request_id (str): request id
Returns:
List[int]: response ids
"""
raise NotImplementedError
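A hypothetical plug-in might look like the sketch below. It assumes FastAPI's ``Request``/``JSONResponse`` types and wraps a made-up ``engine`` object whose ``chat`` and ``async_generate`` coroutines are assumptions for illustration, not part of verl or any real engine.
.. code:: python

    from typing import Any

    from fastapi import Request
    from fastapi.responses import JSONResponse


    class MyEngineAsyncServer(AsyncServerBase):
        """Sketch of a plug-in server wrapping a hypothetical inference engine."""

        def __init__(self, engine):
            self.engine = engine  # hypothetical engine handle

        async def chat_completion(self, raw_request: Request) -> JSONResponse:
            body = await raw_request.json()
            # `engine.chat` is an assumed coroutine returning the assistant text.
            text = await self.engine.chat(body["messages"])
            return JSONResponse(
                {"choices": [{"message": {"role": "assistant", "content": text}}]}
            )

        async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
            # Token-in token-out path used by the agent loop.
            return await self.engine.async_generate(prompt_ids, sampling_params, request_id)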
Chat completion vs Token in token out
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
The following conclusion is based on our recent experience and is still open to investigation and discussion.
Almost all agent frameworks (LangGraph, CrewAI, LlamaIndex, etc.) call the LLM with the OpenAI chat completion API and
keep the chat history as messages, so users may expect that we should use the chat completion API in multi-turn rollout.
But based on our recent experience with single-turn training on DAPO and multi-turn training on `retool <https://github.com/volcengine/verl/tree/main/recipe/retool>`_,
we found that the token_ids from applying the chat template to the final messages may not equal the token_ids obtained by concatenating the prompt_ids and response_ids from each turn.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/multi_turn.png?raw=true
**Where does this inconsistency happen?**
First, the tool parser may alter the content. For example:
.. code:: json
{"role": "assistant", "content": "Let me call a <tool_call>...</tool_call> and get the result"}
After tool_calls extraction, the message looks like this:
.. code:: json
{"role": "assistant", "content": "Let me call a and get the result", "tool_calls": [{"name": "foo", "arguments": "{}"}]}
Encoding the extracted message back does not reproduce the original LLM-generated response_ids.
Second, the ``decode-encode`` round trip may also lead to inconsistency: `Agent-R1 issue#30 <https://github.com/0russwest0/Agent-R1/issues/30#issuecomment-2826155367>`_.
**What is the impact of this inconsistency?**
This inconsistency is not a big problem for serving/agent systems, but it is critical for RL training:
it causes the trajectory to deviate from the policy model distribution. We have observed that applying apply_chat_template
to the final chat history messages can keep PPO training from converging even in the single-turn setting.
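A quick way to check this in your own rollout is to compare the two tokenizations directly. The function below is a diagnostic sketch under stated assumptions (a Hugging Face-style ``tokenizer`` and the per-turn ids recorded during rollout); it is not part of verl.
.. code:: python

    from typing import Any


    def rollout_ids_match_reencoding(
        tokenizer: Any,
        prompt_ids: list[int],
        per_turn_response_ids: list[list[int]],
        final_messages: list[dict[str, Any]],
    ) -> bool:
        """Return True if re-encoding the final chat history reproduces the rollout ids."""
        # Tokens actually produced during rollout: the prompt ids plus each turn's
        # ids (LLM-generated tokens and tool-response tokens), concatenated in order.
        rollout_ids = list(prompt_ids)
        for ids in per_turn_response_ids:
            rollout_ids += ids

        # Tokens obtained by re-encoding the final chat history messages.
        reencoded_ids = tokenizer.apply_chat_template(
            final_messages, add_generation_prompt=False, tokenize=True
        )
        return rollout_ids == reencoded_ids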
vLLM
^^^^
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_vllm.png?raw=true
For vLLM, the Async LLM Engine runs in the same process as the server, and the ModelRunner runs in the same process as the FSDP/Megatron-LM workers.
The Async LLM Engine communicates with the ModelRunner through ZeroMQ. When the server receives a request, it directly calls the engine to generate response_ids.
SGLang
^^^^^^
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_sglang.png?raw=true
For SGLang, the Async LLM Engine runs in the same process as FSDP/Megatron-LM worker-0, and it spawns multiple subprocesses as ModelRunners.
The Async LLM Engine also communicates with the ModelRunners through ZeroMQ. When the server receives a request, it remote-calls worker-0 and gets the response_ids.
AsyncLLMServerManager
~~~~~~~~~~~~~~~~~~~~~
AsyncLLMServerManager serves as a proxy to multiple AsyncLLMServer instances and provides:
- load balancing: select the server instance with the fewest requests for the first turn and send the request to it.
- sticky sessions: bind the request_id to a server instance, so that the same request_id is sent to the same server instance in following turns.
AsyncLLMServerManager is passed to ``AgentLoopBase.__init__``; whenever users want to interact with the LLM in an agent loop,
they can call ``AsyncLLMServerManager.generate`` to generate response_ids.
.. code:: python
class AsyncLLMServerManager:
async def generate(
self,
request_id,
*,
prompt_ids: list[int],
sampling_params: dict[str, Any],
) -> list[int]:
"""Generate tokens from prompt ids.
Args:
request_id (str): request id for sticky session.
prompt_ids (List[int]): List of prompt token ids.
sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
Returns:
List[int]: List of generated token ids.
"""
...
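The sketch below shows the sticky-session pattern across two turns and how a response_mask could be assembled; the ``run_tool`` helper and the tokenizer handling are illustrative assumptions, not verl APIs.
.. code:: python

    from uuid import uuid4


    async def two_turn_example(server_manager, tokenizer, messages, sampling_params, run_tool):
        """Illustrative two-turn interaction; `run_tool` returns tool-response token ids."""
        request_id = uuid4().hex  # reuse the same id so both turns hit the same server
        prompt_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)

        # Turn 1: the LLM proposes a tool call.
        first_ids = await server_manager.generate(
            request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
        )
        tool_ids = run_tool(first_ids)  # assumed helper: parse the call, run it, tokenize the result

        # Turn 2: continue from the concatenated token ids (token-in token-out).
        second_ids = await server_manager.generate(
            request_id,
            prompt_ids=prompt_ids + first_ids + tool_ids,
            sampling_params=sampling_params,
        )
        response_ids = first_ids + tool_ids + second_ids
        response_mask = [1] * len(first_ids) + [0] * len(tool_ids) + [1] * len(second_ids)
        return response_ids, response_mask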
Next
----
- :doc:`Agentic RL Training<../start/agentic_rl>`: Quick start agentic RL training with gsm8k dataset.
- `LangGraph MathExpression <https://github.com/volcengine/verl/tree/main/recipe/langgraph_agent/example>`_: Demonstrates how to use LangGraph to build an agent loop.
- `Retool <https://github.com/volcengine/verl/tree/main/recipe/retool>`_: End-to-end retool paper reproduction using a tool agent.