# Start from the verl base image
# Dockerfile.base
FROM verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4

# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

# Install torch-2.6.0+cu124 + vllm-0.8.5.post1
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --resume-retries 999 --no-cache-dir vllm==0.8.5.post1

# Install flashinfer-0.2.2.post1+cu126 (cxx11abi=True)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN aria2c --max-tries=9999 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
    pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
    rm flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl

# Fix packages
RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
    "numpy<2.0.0" "pyarrow>=19.0.1" pandas \
    ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \
    pytest py-spy pyext pre-commit ruff

RUN pip uninstall -y pynvml nvidia-ml-py && \
    pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"

RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87

# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1

# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2

# Fix for transformers 4.53.0
RUN pip3 install --no-cache-dir "transformers[hf_xet]<4.52.0"

# Install mbridge
RUN pip3 install --no-cache-dir mbridge