Initial commit

7f6cc211 · jerrrrry · 7f6cc211 · 7f6cc211 · 7f6cc211 · 7f6cc211
Commit 7f6cc211 authored Aug 05, 2025 by jerrrrry
20 changed files
--- a/docker/Dockerfile.ngc.vllm
+++ b/docker/Dockerfile.ngc.vllm
+# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
+FROM nvcr.io/nvidia/pytorch:24.05-py3
+# Uninstall the nv-pytorch fork and the related packages preinstalled in the
+# base image before installing the pinned upstream wheels.
+RUN pip3 uninstall -y \
+    apex \
+    flash_attn \
+    megatron-core \
+    pytorch-quantization \
+    pytorch-triton \
+    torch \
+    torch-tensorrt \
+    torchvision \
+    transformer_engine \
+    xgboost
+# Install the pinned torch stack from the cu124 wheel index.
+# --no-cache-dir keeps pip's download cache out of the image layer, matching
+# the other install steps in this Dockerfile.
+RUN pip3 install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 \
+    --index-url https://download.pytorch.org/whl/cu124
+# =============== Megatron dependencies (optional) =================
+# Build apex from source with its C++ and CUDA extensions enabled.
+# MAX_JOBS is capped at 4 to avoid OOMs during compilation.
+RUN MAX_JOBS=4 pip3 install \
+    --verbose \
+    --disable-pip-version-check \
+    --no-build-isolation \
+    --no-cache-dir \
+    --config-settings "--build-option=--cpp_ext" \
+    --config-settings "--build-option=--cuda_ext" \
+    git+https://github.com/NVIDIA/apex
+# =============== End of Megatron dependencies (optional) =================
+# Core Python dependencies. Only requirements that contain shell
+# metacharacters (<, >) are quoted.
+RUN pip3 install --no-cache-dir \
+    accelerate \
+    codetiming \
+    datasets \
+    dill \
+    hydra-core \
+    numpy \
+    pandas \
+    peft \
+    "pyarrow>=15.0.0" \
+    pybind11 \
+    pylatexenc \
+    "ray>=2.10" \
+    "tensordict<0.6" \
+    transformers \
+    "vllm==0.6.3.post1" \
+    wandb
+# full dependencies
+# --no-cache-dir keeps pip's download cache out of the layer, consistent with
+# the other pip install steps in this Dockerfile.
+RUN pip3 install --no-cache-dir pytest pre-commit py-spy pyext liger-kernel
+# =============== Megatron dependencies (optional) =================
+# install Transformer Engine, which requires FA 2.5.8. Do it in a separate step for docker cache
+# flash-attn 2.5.8 is installed without build isolation, with MAX_JOBS and NINJA_FLAGS limited to 4 jobs.
+RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
+# Transformer Engine v1.7.0 is installed from the eric-haibin-lin fork with a single job and TE_BUILD_WITH_NINJA=0.
+RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
+# =============== End of Megatron dependencies (optional) =================
--- a/docker/Dockerfile.ngc.vllm0.8
+++ b/docker/Dockerfile.ngc.vllm0.8
+# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
+FROM nvcr.io/nvidia/pytorch:24.08-py3
+# Environment variables, declared in a single ENV instruction
+ENV MAX_JOBS=32 \
+    VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    DEBIAN_FRONTEND=noninteractive \
+    NODE_OPTIONS="" \
+    PIP_ROOT_USER_ACTION=ignore \
+    HF_HUB_ENABLE_HF_TRANSFER="1"
+# Mirrors used during installation (overridable with --build-arg)
+ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
+ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+# Back up the stock sources.list, then regenerate it with one entry per jammy
+# pocket pointing at ${APT_SOURCE}
+RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak \
+    && for pocket in jammy jammy-updates jammy-backports jammy-security; do \
+        echo "deb ${APT_SOURCE} ${pocket} main restricted universe multiverse"; \
+    done > /etc/apt/sources.list
+# Install systemd (provides systemctl) and tini.
+# Both packages are installed in one layer so `apt-get update` runs once, and
+# the apt package lists are removed in the same layer so they do not remain in
+# the image.
+RUN apt-get update && \
+    apt-get install -y -o Dpkg::Options::="--force-confdef" systemd tini && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Point pip's index and extra index at ${PIP_INDEX}, then upgrade pip itself
+RUN pip config set global.index-url "${PIP_INDEX}" \
+    && pip config set global.extra-index-url "${PIP_INDEX}" \
+    && python -m pip install --upgrade pip
+# Remove the nv-pytorch fork and the related preinstalled packages
+RUN pip uninstall -y \
+    apex \
+    flash_attn \
+    grpcio \
+    megatron-core \
+    pytorch-quantization \
+    pytorch-triton \
+    torch \
+    torch-tensorrt \
+    torchaudio \
+    torchvision \
+    transformer_engine \
+    xgboost
+# Install torch-2.6.0+cu124 + vllm-0.8.3
+# torch-2.6.0+cu124: cxx11abi=False
+# torch-2.6.0+cu126: cxx11abi=True
+# see https://github.com/flashinfer-ai/flashinfer/issues/911
+# Everything is installed in a single pip invocation: the pinned vllm/torch stack,
+# the Hugging Face libraries, data libraries, ray and utilities, and test/lint tools.
+RUN pip install --no-cache-dir "vllm==0.8.3" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata \
+    "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
+    "numpy<2.0.0" "pyarrow>=15.0.0" pandas \
+    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
+    pytest py-spy pyext pre-commit ruff
+# Install flash-attn-2.7.4.post1 (cxx11abi=False)
+# The downloaded wheel is deleted in the same layer so it does not stay in the image.
+RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+    pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+    rm -f flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
+# vllm-0.8.3 does not support flashinfer>=0.2.3
+# see https://github.com/vllm-project/vllm/pull/15777
+# As above, the wheel file is removed once it has been installed.
+RUN wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
+    pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
+    rm -f flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
+# Fix packages
+# Removes both pynvml and nvidia-ml-py, then installs/upgrades nvidia-ml-py,
+# fastapi[standard], optree, pydantic and grpcio to at least the listed versions.
+RUN pip uninstall -y pynvml nvidia-ml-py && \
+    pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
+# Install verl
+# Installs or upgrades (-U) verl with its vllm extra.
+RUN pip install --no-cache-dir verl[vllm] -U
+# Reset pip config
+# Removes the two index settings written by the "Change pip source" step.
+RUN pip config unset global.index-url && \
+    pip config unset global.extra-index-url
--- a/docker/Dockerfile.ngc.vllm0.8.sagemaker
+++ b/docker/Dockerfile.ngc.vllm0.8.sagemaker
+# Using a pre-built image from AWS DLC which contains the current version of python (3.10) and supported cuda version (12.1)
+FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
+# Remove the preinstalled torch stack and related packages
+RUN pip3 uninstall -y \
+    apex \
+    flash_attn \
+    megatron-core \
+    pytorch-quantization \
+    pytorch-triton \
+    torch \
+    torch-tensorrt \
+    torchvision \
+    transformer_engine \
+    xgboost
+# Environment variables, declared in a single ENV instruction
+ENV MAX_JOBS=32 \
+    VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    DEBIAN_FRONTEND=noninteractive \
+    NODE_OPTIONS="" \
+    HF_HUB_ENABLE_HF_TRANSFER="1"
+# Install systemd (provides systemctl) and tini.
+# Both packages are installed in one layer so `apt-get update` runs once, and
+# the apt package lists are removed in the same layer so they do not remain in
+# the image.
+RUN apt-get update && \
+    apt-get install -y -o Dpkg::Options::="--force-confdef" systemd tini && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Install torch-2.6.0 + vllm-0.8.2
+# Requirements containing `>` or `[...]` are quoted. Unquoted, the shell parses
+# `transformers>=4.49.0` as the argument `transformers` plus a redirection of
+# stdout to a file named `=4.49.0`, so pip never sees the version constraint.
+RUN pip install --no-cache-dir vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata==0.11.0 \
+    "transformers>=4.49.0" accelerate datasets peft hf-transfer \
+    "ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
+    pytest pre-commit py-spy pyext ruff
+# Install flash_attn-2.7.4.post1
+# transformer-engine and flash-attn are removed first; flash-attn is then
+# reinstalled at the pinned version without build isolation.
+RUN pip uninstall -y transformer-engine flash-attn && \
+    pip install flash-attn==2.7.4.post1 --no-build-isolation
+# Fix cv2
+# Requirements containing `>` are quoted; unquoted, the shell treats `>=...` as
+# an output redirection and pip never sees the version constraint.
+RUN pip uninstall -y pynvml nvidia-ml-py && \
+    pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
+    pip install --no-cache-dir --upgrade "optree>=0.13.0"
+# Install verl
+# The requirement is quoted so the shell never treats `[vllm]` as a glob pattern.
+RUN pip install --no-cache-dir -U "verl[vllm]"
+# Reset pip config
+# This Dockerfile does not set these keys itself, and `pip config unset` exits
+# non-zero when a key is absent, so a missing key is tolerated here instead of
+# failing the build.
+RUN pip config unset global.index-url || true; \
+    pip config unset global.extra-index-url || true
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
+# FROM "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-rel-6.4:94_ubuntu22.04_py3.10_pytorch_release-2.7_575e247"
+# Base image (per its tag: ROCm 6.3.4, vllm 0.8.5 with NUMA patch, Ubuntu 22.04).
+FROM "rlfoundation.azurecr.io/rocm6.3.4:vllm-0.8.5-numa-patch-ubuntu-22.04"
+# Run every RUN step through bash with -e, -u, -x and pipefail enabled.
+SHELL ["/bin/bash", "-ceuxo", "pipefail"]
+# Job count exported to the build steps below.
+ENV MAX_JOBS=512
+ENV PATH="/usr/local/python3.12/bin:$PATH"
+# Make `python` and `pip` point at the Python 3.12 executables.
+RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
+    ln -sf /usr/bin/pip3.12 /usr/bin/pip
+############################################
+############################################
+# Refresh the package index and install in the same layer, so a cached
+# `apt-get update` layer is never paired with a later install step. The package
+# lists are removed in the same layer to keep them out of the image.
+RUN apt-get update && \
+    apt-get install -y pkg-config liblzma-dev && \
+    rm -rf /var/lib/apt/lists/*
+############################################
+############################################
+###########################################
+##########Install TransformerEngine########
+###########################################
+WORKDIR /workspace/
+# transformer-engine install
+# https://github.com/ROCm/TransformerEngine
+# Start from a clean checkout, then pin the build to commit 236178e5.
+RUN rm -rf TransformerEngine
+RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
+WORKDIR /workspace/TransformerEngine
+RUN git checkout 236178e5
+# git checkout bb061ade
+# git checkout 864405c
+# Build configuration read by the TransformerEngine build (ROCm, gfx942, hipBLASLt, PyTorch).
+ENV NVTE_FRAMEWORK=pytorch
+ENV NVTE_ROCM_ARCH=gfx942
+ENV NVTE_USE_HIPBLASLT=1
+ENV NVTE_USE_ROCM=1
+# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
+ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
+# ENV NVTE_BUILD_MAX_JOBS=${MAX_JOBS}
+# Use ${MAX_JOBS} (variable expansion). The previous $(MAX_JOBS) is shell
+# command substitution: it tried to execute a program named MAX_JOBS and
+# passed an empty MAX_JOBS to the build.
+RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
+WORKDIR /workspace/
+###########################################
+###########################################
+###########################################
+####################################################################################
+################Install vllm - sglang require vllm 0.6.7 dependency#################
+####################################################################################
+#### Require vllm 0.6.7 - checkout 113274a0
+WORKDIR /workspace/
+# Remove any previous checkout and the installed vllm package before building from source.
+RUN rm -rf vllm
+RUN pip uninstall -y vllm
+# Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
+# NOTE(review): the section banner mentions vllm 0.6.7 while this comment mentions 0.6.3 — confirm which version commit 113274a0 corresponds to.
+RUN git clone https://github.com/ROCm/vllm.git
+# git clone https://github.com/vllm-project/vllm.git
+WORKDIR /workspace/vllm
+RUN git checkout 113274a0
+ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+#ENV MAX_JOBS=512
+# Re-declares MAX_JOBS with its current value; the value itself is unchanged.
+ENV MAX_JOBS=${MAX_JOBS}
+RUN pip install "boto3>=1.26.0"
+RUN pip install setuptools_scm
+# will add src into py. You can delete the repo
+RUN python3 setup.py install
+WORKDIR /workspace/
+####################################################################################
+####################################################################################
+####################################################################################
+###########################################
+############For hack docker################
+###########################################
+RUN pip install setuptools==75.8.0
+###########################################
+###########################################
+###########################################
+###########################################
+############build sglang###################
+###########################################
+# Set environment variables
+# Build inputs for the sglang / Triton / aiter steps below: workspace path,
+# build type, and the repository + ref to check out for each project.
+ENV BASE_DIR=/sgl-workspace
+ENV BUILD_TYPE=all
+ENV SGL_REPO=https://github.com/sgl-project/sglang
+ENV SGL_BRANCH=v0.4.6.post5
+ENV TRITON_REPO=https://github.com/ROCm/triton.git
+ENV TRITON_COMMIT=improve_fa_decode_3.0.0
+ENV AITER_REPO=https://github.com/ROCm/aiter.git
+ENV AITER_COMMIT=v0.1.2
+# v0.1.2 version - commit id: 9d11f47
+# ENV AITER_COMMIT=9d11f47
+# Settings that persist into the final image.
+# NOTE(review): their exact effect is defined by the consuming libraries — confirm against their docs.
+ENV HIP_FORCE_DEV_KERNARG=1
+ENV HSA_NO_SCRATCH_RECLAIM=1
+ENV SGLANG_SET_CPU_AFFINITY=1
+ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+ENV NCCL_MIN_NCHANNELS=112
+ENV MOE_PADDING=1
+ENV VLLM_FP8_PADDING=1
+ENV VLLM_FP8_ACT_PADDING=1
+ENV VLLM_FP8_WEIGHT_PADDING=1
+ENV VLLM_FP8_REDUCE_CONV=1
+ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
+ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
+# GPU architecture selection for the compilers and build scripts.
+ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
+ENV AMDGPU_TARGETS=gfx942
+ENV ROCM_ARCH=gfx942
+# Same value as the PYTORCH_ROCM_ARCH declared earlier in this Dockerfile.
+ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+# Switch to working directory
+WORKDIR /sgl-workspace
+# Clean and create directory
+RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
+# Clone and build sglang
+# Steps: clone ${SGL_REPO}, check out ${SGL_BRANCH}, build sgl-kernel with the ROCm
+# pyproject/setup files, install the python package editable with the extra chosen
+# by BUILD_TYPE, copy the checkout to /sglang, and purge pip's cache.
+# NOTE(review): `&&` and `||` have equal precedence in the shell, so the
+# `|| echo "Using default branch"` fallback also fires when the clone or `cd` fails,
+# and a failed checkout does not abort the build — confirm this is intended.
+RUN git clone ${SGL_REPO} \
+    && cd sglang \
+    && git checkout ${SGL_BRANCH} || echo "Using default branch" \
+    && cd sgl-kernel \
+    && rm -f pyproject.toml \
+    && mv pyproject_rocm.toml pyproject.toml \
+    && python setup_rocm.py install \
+    && cd .. \
+    && if [ "$BUILD_TYPE" = "srt" ]; then \
+         python -m pip --no-cache-dir install -e "python[srt_hip]"; \
+       else \
+         python -m pip --no-cache-dir install -e "python[all_hip]"; \
+       fi \
+    && cd /sgl-workspace \
+    && cp -r /sgl-workspace/sglang /sglang \
+    && python -m pip cache purge
+# Extra Python packages, installed in a single pip invocation
+RUN pip install \
+    IPython \
+    orjson \
+    pybind11 \
+    python-multipart \
+    torchao
+# Replace the installed Triton with a source build of ${TRITON_REPO} at
+# ${TRITON_COMMIT}; a failing uninstall is ignored
+RUN { pip uninstall -y triton || true; } && \
+    git clone ${TRITON_REPO} && \
+    cd triton && \
+    git checkout ${TRITON_COMMIT} && \
+    cd python && \
+    python3 setup.py install && \
+    cd /sgl-workspace
+# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
+# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
+# Build aiter
+#version: Commit 9d11f47
+    # && git checkout ${AITER_COMMIT} \
+# Remove any installed aiter first; a failing uninstall is ignored.
+RUN pip uninstall -y aiter || true
+# Clone ${AITER_REPO}, check out ${AITER_COMMIT}, sync and fetch all submodules,
+# then install with PREBUILD_KERNELS=1 for GPU_ARCHS=gfx942.
+RUN git clone ${AITER_REPO} \
+    && cd aiter \
+    && git checkout ${AITER_COMMIT} \
+    && git submodule sync \
+    && git submodule update --init --recursive \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
+    && cd /sgl-workspace
+    # && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
+    # && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
+# Copy MI300X config
+# For every file whose name contains MI300X in the two sglang config directories,
+# create a copy whose path has the first "MI300X" replaced by "MI300X_VF".
+RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
+         /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
+         -type f -name '*MI300X*' | \
+         xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
+# Environment setup complete.
+RUN echo "Environment setup complete."
+WORKDIR /workspace/
+###########################################
+###########################################
+###########################################
+###########################################
+###############vllm v0.8.5#################
+###########################################
+# ENV GITHUB_USERNAME=yushengsu-thu
+# ENV GITHUB_MAIL=yushengsu@gmail.com
+# RUN git config --global user.name "${GITHUB_USERNAME}" \
+#     && git config --global user.email "${GITHUB_MAIL}" 
+WORKDIR /workspace/
+# Build settings for the patched vllm source build below.
+ENV VLLM_TARGET_DEVICE=rocm 
+ENV ROCM_PATH=/opt/rocm 
+ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
+# Find the repo path in: DockerFile/Dockerfile.rocm_yang
+# RUN git clone https://github.com/RLFoundation/vllm-patch.git
+# Remove the vllm installed by the earlier step and any stale checkout.
+RUN pip uninstall -y vllm || true
+RUN rm -rf vllm-patch
+# Clone vllm-patch at v0.8.5-sleep-numa, clear old build artifacts, symlink
+# libamdhip64.so into /usr/lib, then install with setup.py.
+RUN git clone https://github.com/RLFoundation/vllm-patch.git \
+    && cd vllm-patch \
+    && git checkout v0.8.5-sleep-numa \
+    && rm -rf build/ dist/ *.egg-info \
+    && ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
+    && SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
+    # RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
+WORKDIR /workspace/
+###########################################
+###########################################
+###########################################
+#########################################
+#### Install megatron-core###############
+#########################################
+# Swap the installed megatron-core for an editable install of the AMD fork
+RUN pip uninstall -y megatron-core \
+    && git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git \
+    && cd Megatron-LM-amd_version \
+    && pip install -vvv -e . \
+    && cd /workspace/
+#########################################
+#########################################
+#########################################
+#######################################
+################apex###################
+#######################################
+WORKDIR /workspace/
+# Swap the installed apex for a source build of the ROCm fork
+RUN pip uninstall -y apex \
+    && git clone https://github.com/ROCm/apex.git \
+    && cd apex \
+    && python setup.py install \
+    && cd /workspace/
+#######################################
+#######################################
+#######################################
+################################################################################
+###########################Add torch_memory_saver###############################
+################################################################################
+# Set environment variables
+# These replace the earlier HIPCC_COMPILE_FLAGS_APPEND value and add the HIP
+# platform define to the C and C++ compiler flags.
+ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
+ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
+ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
+# Install torch_memory_saver from the `numa` branch of the YangWang92 fork.
+RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
+################################################################################
+################################################################################
+################################################################################
+########################################
+######Install ray#######################
+########################################
+# need to add this patch: https://github.com/ray-project/ray/pull/53531/files
+# NOTE(review): no patch is applied in this step — only a ray release >= 2.47.0 is installed; confirm that release contains the change.
+RUN pip uninstall ray -y
+RUN pip install "ray[data,train,tune,serve]>=2.47.0" 
+########################################
+########################################
+########################################
+##########################################
+#######Install other dependencies#########
+##########################################
+# tensordict is installed first without its dependencies; the remaining
+# packages are then installed together
+RUN pip install --no-deps "tensordict==0.6.2" \
+    && pip install \
+        accelerate \
+        codetiming \
+        datasets \
+        dill \
+        hydra-core \
+        liger-kernel \
+        numpy \
+        orjson \
+        pandas \
+        peft \
+        "pyarrow>=15.0.0" \
+        pybind11 \
+        pylatexenc \
+        torchdata \
+        wandb
+WORKDIR /workspace/
+# Clone verl and install it in editable mode
+RUN git clone https://github.com/volcengine/verl.git \
+    && cd verl \
+    && pip install -e .
+##########################################
+##########################################
+##########################################
+WORKDIR /workspace/
+CMD ["/usr/bin/bash"]
--- a/docker/Dockerfile.rocm_verl-0.3.0.post1
+++ b/docker/Dockerfile.rocm_verl-0.3.0.post1
+#  Build the docker in the repo dir:
+# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
+# docker images # you can find your built docker
+# Support - Training: fsdp; Inference: vllm
+# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+# Support - Training: fsdp; Inference: vllm, sglang
+FROM lmsysorg/sglang:v0.4.6.post5-rocm630
+# Set working directory
+# WORKDIR $PWD/app
+# ROCm build targets and HIP platform defines, declared in a single ENV instruction
+ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
+    HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__" \
+    CFLAGS="-D__HIP_PLATFORM_AMD__" \
+    CXXFLAGS="-D__HIP_PLATFORM_AMD__"
+# Replace the installed vllm with a source build of the v0.6.3 tag; the
+# checkout is deleted again within the same layer
+RUN pip uninstall -y vllm \
+    && rm -rf vllm \
+    && git clone -b v0.6.3 https://github.com/vllm-project/vllm.git \
+    && cd vllm \
+    && MAX_JOBS=$(nproc) python3 setup.py install \
+    && cd .. \
+    && rm -rf vllm
+# Copy the entire project directory
+# The build context is copied into the image's current working directory (no WORKDIR is set in this file).
+COPY . .
+# Install dependencies
+# tensordict is installed without its dependencies; the remaining packages are installed together.
+RUN pip install "tensordict==0.6.2" --no-deps && \
+    pip install accelerate \
+    codetiming \
+    datasets \
+    dill \
+    hydra-core \
+    liger-kernel \
+    numpy \
+    pandas \
+    peft \
+    "pyarrow>=15.0.0" \
+    pylatexenc \
+    "ray[data,train,tune,serve]<2.45.0" \
+    torchdata \
+    transformers \
+    wandb \
+    orjson \
+    pybind11
+# Clone verl from GitHub and install it in editable mode.
+# NOTE(review): verl is installed from this fresh clone, not from the files copied by `COPY . .` — confirm that is intended.
+RUN git clone https://github.com/volcengine/verl.git && \
+    cd verl && \
+    pip install -e . 
+# Install torch_memory_saver
+RUN pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
--- a/docker/Dockerfile.rocm_verl-0.4.1
+++ b/docker/Dockerfile.rocm_verl-0.4.1
--- a/docker/Dockerfile.sglang
+++ b/docker/Dockerfile.sglang
+# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
+FROM nvcr.io/nvidia/pytorch:24.08-py3
+# Environment variables, declared in a single ENV instruction
+ENV MAX_JOBS=32 \
+    DEBIAN_FRONTEND=noninteractive \
+    NODE_OPTIONS=""
+# Ubuntu mirror used during installation (overridable with --build-arg)
+ARG APT_SOURCE=https://mirrors.ustc.edu.cn/ubuntu/
+# Back up the stock sources.list, then regenerate it with one entry per jammy
+# pocket pointing at ${APT_SOURCE}
+RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak \
+    && for pocket in jammy jammy-updates jammy-backports jammy-security; do \
+        echo "deb ${APT_SOURCE} ${pocket} main restricted universe multiverse"; \
+    done > /etc/apt/sources.list
+# Install systemd (provides systemctl) and tini.
+# Both packages are installed in one layer so `apt-get update` runs once, and
+# the apt package lists are removed in the same layer so they do not remain in
+# the image.
+RUN apt-get update && \
+    apt-get install -y -o Dpkg::Options::="--force-confdef" systemd tini && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Point pip's index and extra index at ${PIP_INDEX}, then upgrade pip itself
+ARG PIP_INDEX=https://mirrors.aliyun.com/pypi/simple/
+RUN pip config set global.index-url "${PIP_INDEX}" \
+    && pip config set global.extra-index-url "${PIP_INDEX}" \
+    && python -m pip install --upgrade pip
+# Remove cuda-python, then install sglang-0.4.6.post5 (resolving flashinfer from
+# the cu124/torch2.6 wheel page) followed by torch-memory-saver
+RUN pip uninstall -y cuda-python \
+    && pip install "sglang[all]==0.4.6.post5" --no-cache-dir \
+        --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python \
+    && pip install torch-memory-saver --no-cache-dir
+# Install torch-2.6.0
+# Requirements containing `>` or `[...]` are quoted. Unquoted, the shell parses
+# `transformers>=4.49.0` as the argument `transformers` plus a redirection of
+# stdout to a file named `=4.49.0`, so pip never sees the version constraint.
+RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
+    "transformers>=4.49.0" accelerate datasets peft hf_transfer \
+    "ray[default]" codetiming hydra-core pandas "pyarrow>=15.0.0" pylatexenc qwen-vl-utils wandb liger-kernel \
+    pytest pre-commit py-spy pyext
+# Install flash_attn-2.7.4.post1
+# transformer-engine and flash-attn are removed first, then the prebuilt wheel is
+# installed. The wheel is deleted in the same layer so it does not stay in the image.
+RUN pip uninstall -y transformer-engine flash-attn && \
+    wget -v https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+    pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+    rm -f flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+# Fix cv2
+# The nvidia-ml-py requirement is quoted; unquoted, the shell treats `>=...` as
+# an output redirection and pip never sees the version constraint.
+RUN pip uninstall -y pynvml nvidia-ml-py && \
+    pip install --no-cache-dir "nvidia-ml-py>=12.560.30" opencv-python-headless==4.8.0.74 fastapi==0.115.6
--- a/docker/Dockerfile.vemlp.vllm.te
+++ b/docker/Dockerfile.vemlp.vllm.te
+# docker buildx build --platform linux/x86_64 -t "verlai/verl:$TAG" -f docker/$FILE .
+# the one in docker.io is an alias for the one veturbo
+# FROM vemlp-cn-beijing.cr.volces.com/veturbo/pytorch:2.4-cu124
+FROM docker.io/haibinlin/verl:v0.0.5-th2.4.0-cu124-base
+# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
+# unset for now
+# NOTE(review): this step presumes the base image has global.index-url configured — confirm.
+RUN pip3 config unset global.index-url
+# transformers 4.47.0 contains the following bug:
+# AttributeError: 'Gemma2Attention' object has no attribute '_flash_attn_uses_top_left_mask'
+# Hence transformers is capped at <= 4.46.0 below.
+RUN pip3 install --no-cache-dir \
+    torch==2.4.0 \
+    accelerate \
+    codetiming \
+    dill \
+    hydra-core \
+    numpy \
+    pybind11 \
+    tensordict \
+    "transformers <= 4.46.0"
+# flash-attn is installed without build isolation.
+RUN pip3 install --no-cache-dir flash-attn==2.7.0.post2 --no-build-isolation
+# vllm depends on ray
+RUN pip3 install --no-cache-dir vllm==0.6.3 ray==2.10
+# install apex
+# Built from source with the C++ and CUDA extensions enabled, using at most 4 jobs.
+RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
+    --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
+    git+https://github.com/NVIDIA/apex
+# install Transformer Engine
+# - flash-attn pinned to 2.5.3 by TransformerEngine, switch to eric-haibin-lin/TransformerEngine.git@v1.7.0 to relax version req
+# - install with: MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 to avoid OOM
+# - cudnn is required by TransformerEngine
+# RUN CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn \
+#     pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
+# NOTE(review): the active steps install flash-attn 2.5.3 (replacing the 2.7.0.post2 installed
+# earlier) and upstream NVIDIA TransformerEngine v1.7, not the fork named above — confirm intent.
+RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install flash-attn==2.5.3 --no-cache-dir --no-build-isolation
+RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
--- a/docker/Dockerfile.vllm.sglang.megatron.deepseek
+++ b/docker/Dockerfile.vllm.sglang.megatron.deepseek
+# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
+FROM nvcr.io/nvidia/pytorch:24.08-py3
+# Environment variables, declared in a single ENV instruction
+ENV MAX_JOBS=32 \
+    VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    DEBIAN_FRONTEND=noninteractive \
+    NODE_OPTIONS="" \
+    PIP_ROOT_USER_ACTION=ignore \
+    HF_HUB_ENABLE_HF_TRANSFER="1"
+# Mirrors used during installation (overridable with --build-arg)
+ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
+ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+# Back up the stock sources.list, then regenerate it with one entry per jammy
+# pocket pointing at ${APT_SOURCE}
+RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak \
+    && for pocket in jammy jammy-updates jammy-backports jammy-security; do \
+        echo "deb ${APT_SOURCE} ${pocket} main restricted universe multiverse"; \
+    done > /etc/apt/sources.list
+# Install systemd (provides systemctl), tini and aria2.
+# All packages are installed in one layer so `apt-get update` runs once, and the
+# apt package lists are removed in the same layer so they do not remain in the
+# image. Later steps that need apt run their own `apt-get update`.
+RUN apt-get update && \
+    apt-get install -y -o Dpkg::Options::="--force-confdef" systemd tini aria2 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Point pip's index and extra index at ${PIP_INDEX}, then upgrade pip itself
+RUN pip config set global.index-url "${PIP_INDEX}" \
+    && pip config set global.extra-index-url "${PIP_INDEX}" \
+    && python -m pip install --upgrade pip
+# Remove the nv-pytorch fork and the related preinstalled packages
+RUN pip uninstall -y \
+    apex \
+    flash_attn \
+    grpcio \
+    megatron-core \
+    pytorch-quantization \
+    pytorch-triton \
+    torch \
+    torch-tensorrt \
+    torchaudio \
+    torchvision \
+    transformer_engine \
+    xgboost
+# Reinstall CUDA 12.4
+# Install the apt pin file for the CUDA repository.
+RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
+    mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
+# Download the CUDA 12.4.1 local repo package, register it and its keyring, install
+# cuda-toolkit-12-4, delete the downloaded .deb, select /usr/local/cuda-12.4 as the
+# `cuda` alternative and remove /usr/local/cuda-12.6.
+RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
+    dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
+    cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
+    apt-get update && \
+    apt-get -y install cuda-toolkit-12-4 && \
+    rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
+    update-alternatives --set cuda /usr/local/cuda-12.4 && \
+    rm -rf /usr/local/cuda-12.6
+# Install torch-2.6.0+cu124 + vllm-0.8.5.post1 + sglang-0.4.6.post5
+# torch-2.6.0+cu124: cxx11abi=False
+# torch-2.6.0+cu126: cxx11abi=True
+# see https://github.com/flashinfer-ai/flashinfer/issues/911
+# Install sglang-0.4.6.post5 and torch-memory-saver
+RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install --resume-retries 999 torch-memory-saver --no-cache-dir
+# Install the pinned vllm / torch stack after sglang.
+RUN pip install --resume-retries 999 --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
+# Install the Hugging Face libraries, data libraries, ray and utilities, and test/lint tools.
+RUN pip install --resume-retries 999 --no-cache-dir "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
+    "numpy<2.0.0" "pyarrow>=15.0.0" pandas \
+    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile \
+    pytest py-spy pyext pre-commit ruff
+# Install flash-attn-2.7.4.post1 (cxx11abi=False)
+# The downloaded wheel is deleted in the same layer so it does not stay in the image.
+RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+    pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+    rm -f flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+# Fix packages
+# Removes both pynvml and nvidia-ml-py, then installs/upgrades nvidia-ml-py,
+# fastapi[standard], optree, pydantic and grpcio to at least the listed versions.
+RUN pip uninstall -y pynvml nvidia-ml-py && \
+    pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
+# Install cudnn
+# Download the cuDNN 9.8.0 local repo package, register it and its keyring, install
+# cudnn-cuda-12, then delete the downloaded .deb.
+RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
+    dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
+    cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
+    apt-get update && \
+    apt-get -y install cudnn-cuda-12 && \
+    rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
+# Also install the matching cuDNN Python wheel.
+RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
+# Install Apex
+# Cloned from GitHub and built with the C++ and CUDA extensions enabled; the checkout is left in place.
+RUN git clone https://github.com/NVIDIA/apex.git && \
+    cd apex && \
+    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+# Install TransformerEngine
+# v2.3, installed without dependencies with NVTE_FRAMEWORK=pytorch exported for the build.
+RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
+# Install Megatron-LM
+# core_v0.12.2, installed without dependencies.
+RUN pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
+# Fix opencv
+# Install opencv-python, then run opencv-fixer's AutoFix once at build time.
+RUN pip install opencv-python
+RUN pip install opencv-fixer && \
+    python -c "from opencv_fixer import AutoFix; AutoFix()"
+# NOTE: verl itself is not installed in this image; no install step follows the
+# original "Install verl" heading.
+# Reset pip config
+# Removes the two index settings written by the "Change pip source" step.
+RUN pip config unset global.index-url && \
+    pip config unset global.extra-index-url
+# Install additional system libraries (aria2, FreeImage, zlib).
+# The instruction starts at column 0 like every other instruction in this file,
+# and the apt caches are removed in the same layer to keep them out of the image.
+RUN apt-get update && \
+    apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
\ No newline at end of file
--- a/docker/README.md
+++ b/docker/README.md
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.12
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.12
+# Start from the verl base image
+# Dockerfile.base
+FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
+# Environment variables, declared in a single ENV instruction
+ENV MAX_JOBS=32 \
+    VLLM_WORKER_MULTIPROC_METHOD=spawn \
+    DEBIAN_FRONTEND=noninteractive \
+    NODE_OPTIONS="" \
+    PIP_ROOT_USER_ACTION=ignore \
+    HF_HUB_ENABLE_HF_TRANSFER="1"
+# Install sglang-0.4.6.post5 and torch-memory-saver
+# flashinfer wheels are resolved from the cu124/torch2.6 wheel page passed via --find-links.
+RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
+# Some sglang operations in 0.4.6.post5 require vllm
+# [Warning] vllm can have some packages not compatible with sglang, for example, flashinfer
+RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
+# Fix packages
+# Install the Hugging Face libraries, data libraries, ray and utilities, and test/lint tools.
+RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
+    "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
+    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
+    pytest py-spy pyext pre-commit ruff
+# Removes both pynvml and nvidia-ml-py, then installs/upgrades nvidia-ml-py,
+# fastapi[standard], optree, pydantic and grpcio to at least the listed versions.
+RUN pip uninstall -y pynvml nvidia-ml-py && \
+    pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
+# Pin the cuDNN Python wheel.
+RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
+# Install TransformerEngine
+# v2.2.1, installed without dependencies and without build isolation, with NVTE_FRAMEWORK=pytorch exported for the build.
+RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
+# Install Megatron-LM
+# core_v0.12.2, installed without dependencies and without build isolation.
+RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
+# Fix for transformers 4.53.0
+# Re-pins transformers below 4.52.0, overriding the ">=4.51.0" requirement installed earlier.
+RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
+# Install mbridge
+RUN pip3 install --no-cache-dir mbridge
\ No newline at end of file
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.12.deepep
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.12.deepep
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.13.preview
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.sglang.vllm.mcore0.13.preview
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.12
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.12
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.12.deepep
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.12.deepep
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.base
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.base
--- a/docker/verl0.4-cu124-torch2.6-fa2.7.4/README.md
+++ b/docker/verl0.4-cu124-torch2.6-fa2.7.4/README.md
--- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang-preview.mcore0.12
+++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang-preview.mcore0.12
--- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang.mcore0.12
+++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang.mcore0.12