# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
FROM nvcr.io/nvidia/pytorch:24.05-py3
# uninstall nv-pytorch fork
RUN pip3 uninstall pytorch-quantization \
pytorch-triton \
torch \
torch-tensorrt \
torchvision \
xgboost transformer_engine flash_attn \
apex megatron-core -y
RUN pip3 install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# =============== Megatron dependencies (optional) =================
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
RUN pip3 install --no-cache-dir \
accelerate \
codetiming \
datasets \
dill \
hydra-core \
numpy \
'pandas' \
'peft' \
'pyarrow>=15.0.0' \
'pybind11' \
'pylatexenc' \
'ray>=2.10' \
'tensordict<0.6' \
'transformers' \
'vllm==0.6.3.post1' \
'wandb'
# full dependencies
RUN pip3 install pytest pre-commit py-spy pyext liger-kernel
# =============== Megatron dependencies (optional) =================
# install Transformer Engine, which requires flash-attn 2.5.8; do it in a separate step to leverage the Docker build cache
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Install torch-2.6.0+cu124 + vllm-0.8.3
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --no-cache-dir "vllm==0.8.3" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata \
"transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
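# (Optional sanity check, not in the original build: the wheel above is a
#  cxx11abi=False build, and torch.compiled_with_cxx11_abi() reports which ABI
#  the installed torch was built with, so this would fail fast on a mismatch.)
# RUN python -c "import torch; assert not torch.compiled_with_cxx11_abi()"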
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Use a pre-built image from AWS DLC, which contains Python 3.10 and a supported CUDA version (12.1)
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
# uninstall nv-pytorch fork
RUN pip3 uninstall -y pytorch-quantization \
pytorch-triton torch torch-tensorrt torchvision \
xgboost transformer_engine flash_attn apex megatron-core
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Install torch-2.6.0 + vllm-0.8.2
RUN pip install --no-cache-dir vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata==0.11.0 \
    "transformers>=4.49.0" accelerate datasets peft hf-transfer \
    "ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
    pytest pre-commit py-spy pyext ruff
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
pip install flash-attn==2.7.4.post1 --no-build-isolation
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
    pip install --no-cache-dir --upgrade "optree>=0.13.0"
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# FROM "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-rel-6.4:94_ubuntu22.04_py3.10_pytorch_release-2.7_575e247"
FROM "rlfoundation.azurecr.io/rocm6.3.4:vllm-0.8.5-numa-patch-ubuntu-22.04"
SHELL ["/bin/bash", "-ceuxo", "pipefail"]
ENV MAX_JOBS=512
ENV PATH="/usr/local/python3.12/bin:$PATH"
RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
ln -sf /usr/bin/pip3.12 /usr/bin/pip
############################################
############################################
RUN apt-get update
RUN apt-get install -y pkg-config liblzma-dev
############################################
############################################
###########################################
##########Install TransformerEngine########
###########################################
WORKDIR /workspace/
# transformer-engine install
# https://github.com/ROCm/TransformerEngine
RUN rm -rf TransformerEngine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
WORKDIR /workspace/TransformerEngine
RUN git checkout 236178e5
# git checkout bb061ade
# git checkout 864405c
ENV NVTE_FRAMEWORK=pytorch
ENV NVTE_ROCM_ARCH=gfx942
ENV NVTE_USE_HIPBLASLT=1
ENV NVTE_USE_ROCM=1
# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
# ENV NVTE_BUILD_MAX_JOBS=$(MAX_JOBS)
RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
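# (Optional sanity check, not in the original build: the ROCm TransformerEngine
#  build should import cleanly once installed.)
# RUN python -c "import transformer_engine.pytorch"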
WORKDIR /workspace/
###########################################
###########################################
###########################################
####################################################################################
################Install vllm - sglang requires vllm 0.6.7 dependency#################
####################################################################################
#### Requires vllm 0.6.7 - checkout 113274a0
WORKDIR /workspace/
RUN rm -rf vllm
RUN pip uninstall -y vllm
# Reference (downgrading vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
RUN git clone https://github.com/ROCm/vllm.git
# git clone https://github.com/vllm-project/vllm.git
WORKDIR /workspace/vllm
RUN git checkout 113274a0
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
#ENV MAX_JOBS=512
ENV MAX_JOBS=${MAX_JOBS}
RUN pip install "boto3>=1.26.0"
RUN pip install setuptools_scm
# installs the source into site-packages; the repo checkout can be deleted afterwards
RUN python3 setup.py install
WORKDIR /workspace/
####################################################################################
####################################################################################
####################################################################################
###########################################
############Docker workaround: pin setuptools################
###########################################
RUN pip install setuptools==75.8.0
###########################################
###########################################
###########################################
###########################################
############build sglang###################
###########################################
# Set environment variables
ENV BASE_DIR=/sgl-workspace
ENV BUILD_TYPE=all
ENV SGL_REPO=https://github.com/sgl-project/sglang
ENV SGL_BRANCH=v0.4.6.post5
ENV TRITON_REPO=https://github.com/ROCm/triton.git
ENV TRITON_COMMIT=improve_fa_decode_3.0.0
ENV AITER_REPO=https://github.com/ROCm/aiter.git
ENV AITER_COMMIT=v0.1.2
# v0.1.2 version - commit id: 9d11f47
# ENV AITER_COMMIT=9d11f47
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112
ENV MOE_PADDING=1
ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
ENV VLLM_FP8_REDUCE_CONV=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
ENV AMDGPU_TARGETS=gfx942
ENV ROCM_ARCH=gfx942
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Switch to working directory
WORKDIR /sgl-workspace
# Clean and create directory
RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
# Clone and build sglang
RUN git clone ${SGL_REPO} \
&& cd sglang \
&& git checkout ${SGL_BRANCH} || echo "Using default branch" \
&& cd sgl-kernel \
&& rm -f pyproject.toml \
&& mv pyproject_rocm.toml pyproject.toml \
&& python setup_rocm.py install \
&& cd .. \
&& if [ "$BUILD_TYPE" = "srt" ]; then \
python -m pip --no-cache-dir install -e "python[srt_hip]"; \
else \
python -m pip --no-cache-dir install -e "python[all_hip]"; \
fi \
&& cd /sgl-workspace \
&& cp -r /sgl-workspace/sglang /sglang \
&& python -m pip cache purge
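# (Optional sanity check, not in the original build: verify the sglang install.)
# RUN python -c "import sglang; print(sglang.__version__)"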
# Install common Python packages
RUN pip install IPython orjson python-multipart torchao pybind11
# Rebuild Triton
RUN pip uninstall -y triton || true \
&& git clone ${TRITON_REPO} \
&& cd triton \
&& git checkout ${TRITON_COMMIT} \
&& cd python \
&& python3 setup.py install \
&& cd /sgl-workspace
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
# Build aiter
#version: Commit 9d11f47
# && git checkout ${AITER_COMMIT} \
RUN pip uninstall -y aiter || true
RUN git clone ${AITER_REPO} \
&& cd aiter \
&& git checkout ${AITER_COMMIT} \
&& git submodule sync \
&& git submodule update --init --recursive \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
&& cd /sgl-workspace
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# Copy MI300X config
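# (Added note: the pipeline below duplicates every MI300X tuning config under an
#  MI300X_VF filename so that virtualized (VF) MI300X GPUs resolve to the same
#  kernel configurations.)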
RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
-type f -name '*MI300X*' | \
xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
# Environment setup complete.
RUN echo "Environment setup complete."
WORKDIR /workspace/
###########################################
###########################################
###########################################
###########################################
###############vllm v0.8.5#################
###########################################
# ENV GITHUB_USERNAME=yushengsu-thu
# ENV GITHUB_MAIL=yushengsu@gmail.com
# RUN git config --global user.name "${GITHUB_USERNAME}" \
# && git config --global user.email "${GITHUB_MAIL}"
WORKDIR /workspace/
ENV VLLM_TARGET_DEVICE=rocm
ENV ROCM_PATH=/opt/rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
# Find the repo path in: DockerFile/Dockerfile.rocm_yang
# RUN git clone https://github.com/RLFoundation/vllm-patch.git
RUN pip uninstall -y vllm || true
RUN rm -rf vllm-patch
RUN git clone https://github.com/RLFoundation/vllm-patch.git \
&& cd vllm-patch \
&& git checkout v0.8.5-sleep-numa \
&& rm -rf build/ dist/ *.egg-info \
&& ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
# RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
WORKDIR /workspace/
###########################################
###########################################
###########################################
#########################################
#### Install megatron-core###############
#########################################
RUN pip uninstall -y megatron-core && \
git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
cd Megatron-LM-amd_version && \
pip install -vvv -e . && \
cd /workspace/
#########################################
#########################################
#########################################
#######################################
################apex###################
#######################################
WORKDIR /workspace/
RUN pip uninstall -y apex && \
git clone https://github.com/ROCm/apex.git && \
cd apex && \
python setup.py install && \
cd /workspace/
#######################################
#######################################
#######################################
################################################################################
###########################Add torch_memory_saver###############################
################################################################################
# Set environment variables
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
################################################################################
################################################################################
################################################################################
########################################
######Install ray#######################
########################################
# need to add this patch: https://github.com/ray-project/ray/pull/53531/files
RUN pip uninstall ray -y
RUN pip install "ray[data,train,tune,serve]>=2.47.0"
########################################
########################################
########################################
##########################################
#######Install other dependencies#########
##########################################
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
torchdata \
wandb \
orjson \
pybind11
WORKDIR /workspace/
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
##########################################
##########################################
##########################################
WORKDIR /workspace/
CMD ["/usr/bin/bash"]
# Build the docker image in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images  # list images to find the one you just built
# Supports - Training: FSDP; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Supports - Training: FSDP; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post5-rocm630
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
# Install vllm
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]<2.45.0" \
torchdata \
transformers \
wandb \
orjson \
pybind11
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
# Install torch_memory_saver
RUN pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
# FROM "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-rel-6.4:94_ubuntu22.04_py3.10_pytorch_release-2.7_575e247"
FROM "rlfoundation.azurecr.io/rocm6.3.4:vllm-0.8.5-numa-patch-ubuntu-22.04"
SHELL ["/bin/bash", "-ceuxo", "pipefail"]
ENV MAX_JOBS=512
ENV PATH="/usr/local/python3.12/bin:$PATH"
RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
ln -sf /usr/bin/pip3.12 /usr/bin/pip
############################################
############################################
RUN apt-get update
RUN apt-get install -y pkg-config liblzma-dev
############################################
############################################
###########################################
##########Install TransformerEngine########
###########################################
WORKDIR /workspace/
# transformer-engine install
# https://github.com/ROCm/TransformerEngine
RUN rm -rf TransformerEngine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
WORKDIR /workspace/TransformerEngine
RUN git checkout 236178e5
# git checkout bb061ade
# git checkout 864405c
ENV NVTE_FRAMEWORK=pytorch
ENV NVTE_ROCM_ARCH=gfx942
ENV NVTE_USE_HIPBLASLT=1
ENV NVTE_USE_ROCM=1
# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
# ENV NVTE_BUILD_MAX_JOBS=$(MAX_JOBS)
RUN MAX_JOBS=$(MAX_JOBS) pip install . -vvv
WORKDIR /workspace/
###########################################
###########################################
###########################################
####################################################################################
################Install vllm - sglang require vllm 0.6.7 dependency#################
####################################################################################
#### Require vllm 0.6.7 - checkout 113274a0
WORKDIR /workspace/
RUN rm -rf vllm
RUN pip uninstall -y vllm
# Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
RUN git clone https://github.com/ROCm/vllm.git
# git clone https://github.com/vllm-project/vllm.git
WORKDIR /workspace/vllm
RUN git checkout 113274a0
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
#ENV MAX_JOBS=512
ENV MAX_JOBS=${MAX_JOBS}
RUN pip install "boto3>=1.26.0"
RUN pip install setuptools_scm
# will add src into py. You can delete the repo
RUN python3 setup.py install
WORKDIR /workspace/
####################################################################################
####################################################################################
####################################################################################
###########################################
############For hack docker################
###########################################
RUN pip install setuptools==75.8.0
###########################################
###########################################
###########################################
###########################################
############build sgalng###################
###########################################
# Set environment variables
ENV BASE_DIR=/sgl-workspace
ENV BUILD_TYPE=all
ENV SGL_REPO=https://github.com/sgl-project/sglang
ENV SGL_BRANCH=v0.4.6.post5
ENV TRITON_REPO=https://github.com/ROCm/triton.git
ENV TRITON_COMMIT=improve_fa_decode_3.0.0
ENV AITER_REPO=https://github.com/ROCm/aiter.git
ENV AITER_COMMIT=v0.1.2
# v0.1.2 version - commit id: 9d11f47
# ENV AITER_COMMIT=9d11f47
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112
ENV MOE_PADDING=1
ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
ENV VLLM_FP8_REDUCE_CONV=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
ENV AMDGPU_TARGETS=gfx942
ENV ROCM_ARCH=gfx942
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Switch to working directory
WORKDIR /sgl-workspace
# Clean and create directory
RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
# Clone and build sglang
RUN git clone ${SGL_REPO} \
&& cd sglang \
&& git checkout ${SGL_BRANCH} || echo "Using default branch" \
&& cd sgl-kernel \
&& rm -f pyproject.toml \
&& mv pyproject_rocm.toml pyproject.toml \
&& python setup_rocm.py install \
&& cd .. \
&& if [ "$BUILD_TYPE" = "srt" ]; then \
python -m pip --no-cache-dir install -e "python[srt_hip]"; \
else \
python -m pip --no-cache-dir install -e "python[all_hip]"; \
fi \
&& cd /sgl-workspace \
&& cp -r /sgl-workspace/sglang /sglang \
&& python -m pip cache purge
# Install common Python packages
RUN pip install IPython orjson python-multipart torchao pybind11
# Rebuild Triton
RUN pip uninstall -y triton || true \
&& git clone ${TRITON_REPO} \
&& cd triton \
&& git checkout ${TRITON_COMMIT} \
&& cd python \
&& python3 setup.py install \
&& cd /sgl-workspace
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
# Build aiter
#version: Commit 9d11f47
# && git checkout ${AITER_COMMIT} \
RUN pip uninstall -y aiter || true
RUN git clone ${AITER_REPO} \
&& cd aiter \
&& git checkout ${AITER_COMMIT} \
&& git submodule sync \
&& git submodule update --init --recursive \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
&& cd /sgl-workspace
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# Copy MI300X config
RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
-type f -name '*MI300X*' | \
xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
# Environment setup complete.
RUN echo "Environment setup complete."
WORKDIR /workspace/
###########################################
###########################################
###########################################
###########################################
###############vllm v0.8.5#################
###########################################
# ENV GITHUB_USERNAME=yushengsu-thu
# ENV GITHUB_MAIL=yushengsu@gmail.com
# RUN git config --global user.name "${GITHUB_USERNAME}" \
# && git config --global user.email "${GITHUB_MAIL}"
WORKDIR /workspace/
ENV VLLM_TARGET_DEVICE=rocm
ENV ROCM_PATH=/opt/rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
# Find the repo path in: DockerFile/Dockerfile.rocm_yang
# RUN git clone https://github.com/RLFoundation/vllm-patch.git
RUN pip uninstall -y vllm || true
RUN rm -rf vllm-patch
RUN git clone https://github.com/RLFoundation/vllm-patch.git \
&& cd vllm-patch \
&& git checkout v0.8.5-sleep-numa \
&& rm -rf build/ dist/ *.egg-info \
&& ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
# RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
WORKDIR /workspace/
###########################################
###########################################
###########################################
#########################################
#### Install megatron-core###############
#########################################
RUN pip uninstall -y megatron-core && \
git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
cd Megatron-LM-amd_version && \
pip install -vvv -e . && \
cd /workspace/
#########################################
#########################################
#########################################
#######################################
################apex###################
#######################################
WORKDIR /workspace/
RUN pip uninstall -y apex && \
git clone https://github.com/ROCm/apex.git && \
cd apex && \
python setup.py install && \
cd /workspace/
#######################################
#######################################
#######################################
################################################################################
###########################Add torch_memory_saver###############################
################################################################################
# Set environment variables
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
################################################################################
################################################################################
################################################################################
########################################
######Install ray#######################
########################################
# need to add this patch: https://github.com/ray-project/ray/pull/53531/files
RUN pip uninstall ray -y
RUN pip install "ray[data,train,tune,serve]>=2.47.0"
########################################
########################################
########################################
##########################################
#######Install other dependencies#########
##########################################
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
torchdata \
wandb \
orjson \
pybind11
WORKDIR /workspace/
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
##########################################
##########################################
##########################################
WORKDIR /workspace/
CMD ["/usr/bin/bash"]
CMD ["/usr/bin/bash"]
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Define installation arguments
ARG APT_SOURCE=https://mirrors.ustc.edu.cn/ubuntu/
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
ARG PIP_INDEX=https://mirrors.aliyun.com/pypi/simple/
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Install torch-2.6.0
RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
    "transformers>=4.49.0" accelerate datasets peft hf_transfer \
    "ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb liger-kernel \
    pytest pre-commit py-spy pyext
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
wget -v https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6
# docker buildx build --platform linux/x86_64 -t "verlai/verl:$TAG" -f docker/$FILE .
# the image on docker.io is an alias for the one in the veturbo registry
# FROM vemlp-cn-beijing.cr.volces.com/veturbo/pytorch:2.4-cu124
FROM docker.io/haibinlin/verl:v0.0.5-th2.4.0-cu124-base
# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
# unset for now
RUN pip3 config unset global.index-url
# transformers 4.47.0 contains the following bug:
# AttributeError: 'Gemma2Attention' object has no attribute '_flash_attn_uses_top_left_mask'
RUN pip3 install --no-cache-dir \
torch==2.4.0 \
accelerate \
codetiming \
dill \
hydra-core \
numpy \
pybind11 \
tensordict \
"transformers <= 4.46.0"
RUN pip3 install --no-cache-dir flash-attn==2.7.0.post2 --no-build-isolation
# vllm depends on ray
RUN pip3 install --no-cache-dir vllm==0.6.3 ray==2.10
# install apex
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# install Transformer Engine
# - flash-attn pinned to 2.5.3 by TransformerEngine, switch to eric-haibin-lin/TransformerEngine.git@v1.7.0 to relax version req
# - install with: MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 to avoid OOM
# - cudnn is required by TransformerEngine
# RUN CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn \
# pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install flash-attn==2.5.3 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
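# (Optional sanity check, not in the original build: confirm the toolkit switch
#  to CUDA 12.4 took effect.)
# RUN /usr/local/cuda/bin/nvcc --version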
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1 + sglang-0.4.6.post5
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install --resume-retries 999 torch-memory-saver --no-cache-dir
RUN pip install --resume-retries 999 --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
RUN pip install --resume-retries 999 --no-cache-dir "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
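# (Optional sanity check, not in the original build: megatron-core installs the
#  `megatron.core` Python module.)
# RUN python3 -c "import megatron.core"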
# Fix opencv
RUN pip install opencv-python
RUN pip install opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
# verl itself is intentionally not installed in this image; install it on top as described in the README
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g
# Dockerfiles of verl
We provide pre-built Docker images for quick setup, and starting from this version we use a new image release hierarchy for productivity and stability.
The image types are divided into three categories:
- **Base Image**: no inference or training frameworks, only basic dependencies. vLLM or SGLang can be installed directly on top of it without reinstalling torch or CUDA.
- **Application Image**: stable version with inference and training frameworks installed.
- **Preview Image**: unstable version with the latest frameworks and features.
The first two types are hosted in the Docker Hub [verlai/verl](https://hub.docker.com/r/verlai/verl) repository, while preview images are hosted in community repositories.
> Image versions map to verl releases; for example, the image tagged ``verl0.4`` is built for the verl ``v0.4.x`` release.
## Base Image
The stable base image is ``verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4``. The installed package versions are encoded in the tag, and the Dockerfile can be found in ``verl[version]-[packages]/Dockerfile.base``.
The base images for preview are ``verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0`` and ``verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0``, which differ in CUDA version.
The base image is updated infrequently, and application images can be built on top of it without reinstalling the base packages, as sketched below.
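For illustration, a minimal custom application image on top of the stable base image might look like this sketch (the package choices are examples, not the recipe for any official app image):
```dockerfile
# Sketch only: build your own app image on the verl base image.
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4

# Pick an inference engine; torch, CUDA and cuDNN already come from the base image.
RUN pip install --no-cache-dir vllm==0.8.5.post1

# Install verl itself without re-resolving dependencies the base image already provides.
RUN pip install --no-cache-dir --no-deps verl
```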
## Application Image
From this version, we provide separate images for vLLM and SGLang, since their dependent packages (such as FlashInfer) have diverged.
There are four types of application images available:
- **vLLM with FSDP and Megatron**: ``verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2``, with Deep-EP support: ``verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2-deepep``.
- **SGLang with FSDP and Megatron**: ``verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2`` (vLLM is still required, which can introduce some package conflicts), with Deep-EP support: ``verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2-deepep``.
- **Preview version of SGLang with FSDP and Megatron, CUDA 12.6**: ``verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2``
- **Preview version of SGLang with FSDP and Megatron, CUDA 12.8**: ``verlai/verl:app-preview-verl0.5-sglang0.4.8-mcore0.12.2-te2.2``
For Megatron 0.13.0 we offer preview images; to use the latest code, replace ``mcore0.12.2`` with ``mcore0.13.0-preview`` in the image tags above.
The latest vLLM support is coming soon.
Docker images with Megatron backends can run post-training for large language models such as ``Qwen/Qwen3-235B-A22B`` and ``deepseek-ai/DeepSeek-V3-0324``. Refer to the Large Language Model Post-Training documentation (``perf/dpsk``) for more details.
Application images are updated frequently, and the Dockerfiles can be found in ``docker/verl[version]-[packages]/Dockerfile.app.[frameworks]``. Starting from the base image, it is easy to build your own application image with the desired inference and training frameworks.
## Community Image
For vLLM with FSDP, please refer to the [hiyouga/verl](https://hub.docker.com/r/hiyouga/verl) repository; the latest version is ``hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0``.
For SGLang with FSDP, please refer to the [ocss884/verl-sglang](https://hub.docker.com/r/ocss884/verl-sglang) repository; the latest version is ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post5``, provided by the SGLang RL Group.
See the files under ``docker/`` for NGC-based images or if you want to build your own.
Note that for AWS instances with an EFA network interface (SageMaker AI Pod), you need to install the EFA driver as shown in ``docker/Dockerfile.extenstion.awsefa``.
## Installation from Docker
After pulling the desired Docker image and installing the desired inference and training frameworks, you can run it with the following steps:
1. Launch the Docker image and attach to it:
```sh
docker create --runtime=nvidia --gpus all --net=host --shm-size="10g" --cap-add=SYS_ADMIN -v .:/workspace/verl --name verl <image:tag> sleep infinity
docker start verl
docker exec -it verl bash
```
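Alternatively, a one-shot interactive container works as well; this sketch mirrors the flags used above:
```sh
docker run --rm -it --runtime=nvidia --gpus all --net=host --shm-size="10g" \
  --cap-add=SYS_ADMIN -v .:/workspace/verl <image:tag> bash
```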
2. If you use the provided images, you only need to install verl itself, without dependencies:
```sh
# install the nightly version (recommended)
git clone https://github.com/volcengine/verl && cd verl
pip3 install --no-deps -e .
```
[Optional] If you want to switch between different frameworks, install verl together with the corresponding extras:
```sh
# install the nightly version (recommended)
git clone https://github.com/volcengine/verl && cd verl
pip3 install -e .[vllm]
pip3 install -e .[sglang]
```
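To confirm that the editable install is importable (assuming one of the installs above succeeded):
```sh
python3 -c "import verl; print(getattr(verl, '__version__', 'installed'))"
```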
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Some sglang operations in 0.4.6.post5 require vllm
# [Warning] vllm may pull in packages that are incompatible with sglang, for example flashinfer
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Some sglang operations in 0.4.6.post5 require vllm
# [Warning] vllm may pull in packages that are incompatible with sglang, for example flashinfer
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## libmlx5 is a dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; the build fails when they are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
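# (Optional sanity check, not in the original build: DeepEP installs the
#  `deep_ep` Python module.)
# RUN python -c "import deep_ep"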
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Some sglang operations in 0.4.6.post5 require vllm
# [Warning] vllm may pull in packages that are incompatible with sglang, for example flashinfer
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## libmlx5 is a dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; the build fails when they are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## libmlx5 is a dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone and build deepep and deepep-nvshmem
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; the build fails when they are not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False), matching the torch-2.6.0+cu124 build above
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5
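# Optional sanity check (a sketch): confirm the preview TE build imports;
# guarded so an optional check cannot fail the build.
RUN python -c "import transformer_engine; print(transformer_engine.__version__)" || echo "transformer_engine import check failed"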
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
# Install DeepEP
## Symlink libmlx5, a dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
## Clone gdrcopy and DeepEP (pinned to a known commit) for the builds below
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
git clone https://github.com/deepseek-ai/DeepEP.git && \
cd DeepEP && git checkout a84a248
# Prepare nvshmem
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch
ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables; the nvshmem build errors out when they are unset.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV GDRCOPY_HOME=/workspace/gdrcopy
## Build deepep-nvshmem
RUN cd deepep-nvshmem && \
NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
NVSHMEM_IBGDA_SUPPORT=1 \
NVSHMEM_PMIX_SUPPORT=0 \
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
NVSHMEM_USE_GDRCOPY=1 \
cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install
ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH
## Build deepep
RUN cd DeepEP && \
python setup.py install
# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks
# Target: verlai/verl:base-v2-cu124-cudnn9.8-torch2.6-fa2.8.0-te2.3
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=16
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
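# Optional sanity check (a sketch): the toolkit should now resolve to 12.4.
RUN /usr/local/cuda/bin/nvcc --version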
RUN pip install --resume-retries 999 --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
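# Optional sanity check (a sketch): the wheel should report a CUDA 12.4 build.
RUN python -c "import torch; print(torch.__version__, torch.version.cuda)"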
RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
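# Optional import check (a sketch): the cxx11abi=False wheel must match the
# ABI of the torch build installed above.
RUN python -c "import flash_attn; print(flash_attn.__version__)"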
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
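# Optional import check (a sketch): confirm apex and its CUDA extensions
# installed; guarded so an optional check cannot fail the build.
RUN python -c "import apex" || echo "apex import check failed"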
# Profiling tools
RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
apt-get update && apt-get install -y libxcb-cursor0 && \
dpkg -i ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \
rm -rf /usr/local/cuda/bin/nsys && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \
rm -rf /usr/local/cuda/bin/nsys-ui && \
ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \
rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb
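# Optional sanity check (a sketch): the symlinked nsys should resolve.
RUN nsys --version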
# Fix opencv
RUN pip install --resume-retries 999 --no-cache-dir opencv-python
RUN pip install --resume-retries 999 --no-cache-dir opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
RUN pip install --resume-retries 999 --no-cache-dir cuda-bindings
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y libfreeimage3 libfreeimage-dev zlib1g htop
# verl image with verl v0.4.x
## Important package versions
```txt
cuda==12.4
cudnn==9.8.0
torch==2.6.0
flash_attn==2.7.4
sglang==0.4.6.post5
vllm==0.8.5.post1
nvidia-cudnn-cu12==9.8.0.87
transformer_engine==2.3
megatron.core==core_v0.12.2
# Preview
transformer_engine==2.5
megatron.core==core_r0.13.0
```
## Target
- Base image:
- `verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4`
- App image:
- `verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2`: SGLang 0.4.6.post5 requires vLLM, and vLLM can introduce package conflicts with SGLang
- `verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2-deepep`: built with DeepEP
- `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2`
- `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2-deepep`: built with DeepEP
- Preview image:
- `verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.13.0-te2.2-preview`
- `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.13.0-te2.2-preview`
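## Usage
A typical way to try one of the app images (a sketch; the tag comes from the
list above, and the `docker run` flags are illustrative, so adjust them to
your setup):

```bash
docker pull verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2
docker run --gpus all --ipc=host --shm-size=16g -it \
  verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2-te2.2 bash
```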
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.9.post2 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --upgrade pip setuptools packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.7.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.9.post2" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.53.2" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation --resume-retries 999 vllm==0.9.2
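# Optional sanity check (a sketch): vllm was installed with --no-deps, so its
# requirements must already be satisfied by the packages above; guarded so an
# optional check cannot fail the build.
RUN python -c "import vllm; print(vllm.__version__)" || echo "vllm import check failed; inspect dependency overlap with sglang"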
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4
# Define environments
ENV MAX_JOBS=8
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.8 and torch-memory-saver
# Install FlashInfer Python package
RUN pip install --upgrade pip setuptools packaging
RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1
RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.52.3" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation --resume-retries 999 vllm==0.9.2