Commit 7f6cc211 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2874 failed with stages
in 0 seconds
# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
FROM nvcr.io/nvidia/pytorch:24.05-py3
# uninstall nv-pytorch fork
RUN pip3 uninstall pytorch-quantization \
pytorch-triton \
torch \
torch-tensorrt \
torchvision \
xgboost transformer_engine flash_attn \
apex megatron-core -y
RUN pip3 install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# =============== Megatron dependencies (optional) =================
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
RUN pip3 install --no-cache-dir \
accelerate \
codetiming \
datasets \
dill \
hydra-core \
numpy \
'pandas' \
'peft' \
'pyarrow>=15.0.0' \
'pybind11' \
'pylatexenc' \
'ray>=2.10' \
'tensordict<0.6' \
'transformers' \
'vllm==0.6.3.post1' \
'wandb'
# full dependencies
RUN pip3 install pytest pre-commit py-spy pyext liger-kernel
# =============== Megatron dependencies (optional) =================
# install Transformer Engine, which requires FA 2.5.8. Do it in a separate step for docker cache
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Install torch-2.6.0+cu124 + vllm-0.8.3
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --no-cache-dir "vllm==0.8.3" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata \
"transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Using a pre-built image from AWS DLC which contains the current version of python (3.10) and supported cuda version (12.1)
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
# uninstall nv-pytorch fork
RUN pip3 uninstall -y pytorch-quantization \
pytorch-triton torch torch-tensorrt torchvision \
xgboost transformer_engine flash_attn apex megatron-core
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Install torch-2.6.0 + vllm-0.8.2
RUN pip install --no-cache-dir vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata==0.11.0 \
transformers>=4.49.0 accelerate datasets peft hf-transfer \
ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest pre-commit py-spy pyext ruff
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
pip install flash-attn==2.7.4.post1 --no-build-isolation
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
pip install --no-cache-dir --upgrade optree>=0.13.0
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# FROM "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-rel-6.4:94_ubuntu22.04_py3.10_pytorch_release-2.7_575e247"
FROM "rlfoundation.azurecr.io/rocm6.3.4:vllm-0.8.5-numa-patch-ubuntu-22.04"
SHELL ["/bin/bash", "-ceuxo", "pipefail"]
ENV MAX_JOBS=512
ENV PATH="/usr/local/python3.12/bin:$PATH"
RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
ln -sf /usr/bin/pip3.12 /usr/bin/pip
############################################
############################################
RUN apt-get update
RUN apt-get install -y pkg-config liblzma-dev
############################################
############################################
###########################################
##########Install TransformerEngine########
###########################################
WORKDIR /workspace/
# transformer-engine install
# https://github.com/ROCm/TransformerEngine
RUN rm -rf TransformerEngine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
WORKDIR /workspace/TransformerEngine
RUN git checkout 236178e5
# git checkout bb061ade
# git checkout 864405c
ENV NVTE_FRAMEWORK=pytorch
ENV NVTE_ROCM_ARCH=gfx942
ENV NVTE_USE_HIPBLASLT=1
ENV NVTE_USE_ROCM=1
# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
# ENV NVTE_BUILD_MAX_JOBS=$(MAX_JOBS)
RUN MAX_JOBS=$(MAX_JOBS) pip install . -vvv
WORKDIR /workspace/
###########################################
###########################################
###########################################
####################################################################################
################Install vllm - sglang require vllm 0.6.7 dependency#################
####################################################################################
#### Require vllm 0.6.7 - checkout 113274a0
WORKDIR /workspace/
RUN rm -rf vllm
RUN pip uninstall -y vllm
# Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
RUN git clone https://github.com/ROCm/vllm.git
# git clone https://github.com/vllm-project/vllm.git
WORKDIR /workspace/vllm
RUN git checkout 113274a0
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
#ENV MAX_JOBS=512
ENV MAX_JOBS=${MAX_JOBS}
RUN pip install "boto3>=1.26.0"
RUN pip install setuptools_scm
# will add src into py. You can delete the repo
RUN python3 setup.py install
WORKDIR /workspace/
####################################################################################
####################################################################################
####################################################################################
###########################################
############For hack docker################
###########################################
RUN pip install setuptools==75.8.0
###########################################
###########################################
###########################################
###########################################
############build sgalng###################
###########################################
# Set environment variables
ENV BASE_DIR=/sgl-workspace
ENV BUILD_TYPE=all
ENV SGL_REPO=https://github.com/sgl-project/sglang
ENV SGL_BRANCH=v0.4.6.post5
ENV TRITON_REPO=https://github.com/ROCm/triton.git
ENV TRITON_COMMIT=improve_fa_decode_3.0.0
ENV AITER_REPO=https://github.com/ROCm/aiter.git
ENV AITER_COMMIT=v0.1.2
# v0.1.2 version - commit id: 9d11f47
# ENV AITER_COMMIT=9d11f47
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112
ENV MOE_PADDING=1
ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
ENV VLLM_FP8_REDUCE_CONV=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
ENV AMDGPU_TARGETS=gfx942
ENV ROCM_ARCH=gfx942
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Switch to working directory
WORKDIR /sgl-workspace
# Clean and create directory
RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
# Clone and build sglang
RUN git clone ${SGL_REPO} \
&& cd sglang \
&& git checkout ${SGL_BRANCH} || echo "Using default branch" \
&& cd sgl-kernel \
&& rm -f pyproject.toml \
&& mv pyproject_rocm.toml pyproject.toml \
&& python setup_rocm.py install \
&& cd .. \
&& if [ "$BUILD_TYPE" = "srt" ]; then \
python -m pip --no-cache-dir install -e "python[srt_hip]"; \
else \
python -m pip --no-cache-dir install -e "python[all_hip]"; \
fi \
&& cd /sgl-workspace \
&& cp -r /sgl-workspace/sglang /sglang \
&& python -m pip cache purge
# Install common Python packages
RUN pip install IPython orjson python-multipart torchao pybind11
# Rebuild Triton
RUN pip uninstall -y triton || true \
&& git clone ${TRITON_REPO} \
&& cd triton \
&& git checkout ${TRITON_COMMIT} \
&& cd python \
&& python3 setup.py install \
&& cd /sgl-workspace
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
# Build aiter
#version: Commit 9d11f47
# && git checkout ${AITER_COMMIT} \
RUN pip uninstall -y aiter || true
RUN git clone ${AITER_REPO} \
&& cd aiter \
&& git checkout ${AITER_COMMIT} \
&& git submodule sync \
&& git submodule update --init --recursive \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
&& cd /sgl-workspace
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop \
# Copy MI300X config
RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
-type f -name '*MI300X*' | \
xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
# Environment setup complete.
RUN echo "Environment setup complete."
WORKDIR /workspace/
###########################################
###########################################
###########################################
###########################################
###############vllm v0.8.5#################
###########################################
# ENV GITHUB_USERNAME=yushengsu-thu
# ENV GITHUB_MAIL=yushengsu@gmail.com
# RUN git config --global user.name "${GITHUB_USERNAME}" \
# && git config --global user.email "${GITHUB_MAIL}"
WORKDIR /workspace/
ENV VLLM_TARGET_DEVICE=rocm
ENV ROCM_PATH=/opt/rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
# Find the repo path in: DockerFile/Dockerfile.rocm_yang
# RUN git clone https://github.com/RLFoundation/vllm-patch.git
RUN pip uninstall -y vllm || true
RUN rm -rf vllm-patch
RUN git clone https://github.com/RLFoundation/vllm-patch.git \
&& cd vllm-patch \
&& git checkout v0.8.5-sleep-numa \
&& rm -rf build/ dist/ *.egg-info \
&& ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
# RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
WORKDIR /workspace/
###########################################
###########################################
###########################################
#########################################
#### Install megatron-core###############
#########################################
RUN pip uninstall -y megatron-core && \
git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
cd Megatron-LM-amd_version && \
pip install -vvv -e . && \
cd /workspace/
#########################################
#########################################
#########################################
#######################################
################apex###################
#######################################
WORKDIR /workspace/
RUN pip uninstall -y apex && \
git clone https://github.com/ROCm/apex.git && \
cd apex && \
python setup.py install && \
cd /workspace/
#######################################
#######################################
#######################################
################################################################################
###########################Add torch_memory_saver###############################
################################################################################
# Set environment variables
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
################################################################################
################################################################################
################################################################################
########################################
######Install ray#######################
########################################
# need to add this patch: https://github.com/ray-project/ray/pull/53531/files
RUN pip uninstall ray -y
RUN pip install "ray[data,train,tune,serve]>=2.47.0"
########################################
########################################
########################################
##########################################
#######Install other dependencies#########
##########################################
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
torchdata \
wandb \
orjson \
pybind11
WORKDIR /workspace/
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
##########################################
##########################################
##########################################
WORKDIR /workspace/
CMD ["/usr/bin/bash"]
# Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker
# Support - Traing: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Traing: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post5-rocm630
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
# Install vllm
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]<2.45.0" \
torchdata \
transformers \
wandb \
orjson \
pybind11
RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
pip install -e .
# Install torch_memory_saver
RUN pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
This diff is collapsed.
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Define installation arguments
ARG APT_SOURCE=https://mirrors.ustc.edu.cn/ubuntu/
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
ARG PIP_INDEX=https://mirrors.aliyun.com/pypi/simple/
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Install torch-2.6.0
RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
transformers>=4.49.0 accelerate datasets peft hf_transfer \
ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb liger-kernel \
pytest pre-commit py-spy pyext
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
wget -v https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6
# docker buildx build --platform linux/x86_64 -t "verlai/verl:$TAG" -f docker/$FILE .
# the one in docker.io is an alias for the one veturbo
# FROM vemlp-cn-beijing.cr.volces.com/veturbo/pytorch:2.4-cu124
FROM docker.io/haibinlin/verl:v0.0.5-th2.4.0-cu124-base
# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
# unset for now
RUN pip3 config unset global.index-url
# transformers 4.47.0 contains the following bug:
# AttributeError: 'Gemma2Attention' object has no attribute '_flash_attn_uses_top_left_mask'
RUN pip3 install --no-cache-dir \
torch==2.4.0 \
accelerate \
codetiming \
dill \
hydra-core \
numpy \
pybind11 \
tensordict \
"transformers <= 4.46.0"
RUN pip3 install --no-cache-dir flash-attn==2.7.0.post2 --no-build-isolation
# vllm depends on ray
RUN pip3 install --no-cache-dir vllm==0.6.3 ray==2.10
# install apex
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# install Transformer Engine
# - flash-attn pinned to 2.5.3 by TransformerEngine, switch to eric-haibin-lin/TransformerEngine.git@v1.7.0 to relax version req
# - install with: MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 to avoid OOM
# - cudnn is required by TransformerEngine
# RUN CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn \
# pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install flash-attn==2.5.3 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1 + sglang-0.4.6.post5
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
# Install sglang-0.4.6.post1 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install --resume-retries 999 torch-memory-saver --no-cache-dir
RUN pip install --resume-retries 999 --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
RUN pip install --resume-retries 999 --no-cache-dir "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix opencv
RUN pip install opencv-python
RUN pip install opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
# Install verl
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g
\ No newline at end of file
This diff is collapsed.
# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Some sglang operations in 0.4.6.post5 require vllm
# [Warning] vllm can have some packages not compatible with sglang, for example, flashinfer
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1
# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=19.0.1" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
pytest py-spy pyext pre-commit ruff
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2
# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"
# Install mbridge
RUN pip3 install --no-cache-dir mbridge
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment