# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4

# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# NOTE(review): DEBIAN_FRONTEND=noninteractive is baked into the runtime env here;
# conventionally it is set only for the build (via ARG or inline). Kept as-is to
# preserve the image's existing runtime environment.
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1

# Install flashinfer-0.2.2.post1+cu126 (cxx11abi=True)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
# Download, install, and delete the wheel in one layer so the wheel file
# does not persist in the image.
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
    pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
    rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl

# Fix packages
# "ray[default]" is quoted like the other extras specs so the brackets are
# never treated as a shell glob pattern.
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
    "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
    "ray[default]" codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
    pytest py-spy pyext pre-commit ruff

# Replace pynvml with the maintained nvidia-ml-py binding and upgrade
# server-side dependencies.
RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"

RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87

# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.5

# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2

# Install mbridge
RUN pip3 install --no-cache-dir mbridge

# Install DeepEP
## the dependency of IBGDA
RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

## Clone and build deepep and deepep-nvshmem
# DeepEP is pinned to commit a84a248 for reproducibility.
RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \
    git clone https://github.com/deepseek-ai/DeepEP.git && \
    cd DeepEP && git checkout a84a248

# Prepare nvshmem
# The tarball is removed in the same layer after extraction so it does not
# bloat the image. NOTE(review): the download is not checksum-verified —
# consider adding a sha256 check. TODO confirm against NVIDIA's published sums.
RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \
    tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \
    rm nvshmem_src_3.2.5-1.txz && \
    cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch

ENV CUDA_HOME=/usr/local/cuda
### Set MPI environment variables. Having errors when not set.
ENV CPATH=/usr/local/mpi/include:$CPATH
ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH
# NOTE(review): gdrcopy is cloned above but not built here; GDRCOPY_HOME assumes
# the clone lands in /workspace (the base image's working directory) — verify.
ENV GDRCOPY_HOME=/workspace/gdrcopy

## Build deepep-nvshmem
# IBGDA + GDRCopy enabled; MPI/UCX/NCCL/PMIx transports disabled per DeepEP's
# recommended nvshmem configuration.
RUN cd deepep-nvshmem && \
    NVSHMEM_SHMEM_SUPPORT=0 \
    NVSHMEM_UCX_SUPPORT=0 \
    NVSHMEM_USE_NCCL=0 \
    NVSHMEM_MPI_SUPPORT=0 \
    NVSHMEM_IBGDA_SUPPORT=1 \
    NVSHMEM_PMIX_SUPPORT=0 \
    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    NVSHMEM_USE_GDRCOPY=1 \
    cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install

ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install
ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH
ENV PATH=$NVSHMEM_DIR/bin:$PATH

## Build deepep
RUN cd DeepEP && \
    python setup.py install