Dockerfile.ci.dev 3.33 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# syntax=docker/dockerfile:1.3-labs

# Global build arguments. FROM_IMAGE_NAME has no default and must be supplied
# by the caller (--build-arg FROM_IMAGE_NAME=...).
ARG FROM_IMAGE_NAME
ARG WHEEL_DIR=/workspace/wheels

FROM ${FROM_IMAGE_NAME} AS main

# Build-time knobs: versions of the tools downloaded later in this stage and
# the location of the uv-managed virtual environment.
ARG UV_VERSION=0.7.2
ARG YQ_VERSION=4.44.1
ARG UV_PROJECT_ENVIRONMENT=/opt/venv

# Static environment settings.
ENV PIP_CONSTRAINT="" \
    DEBIAN_FRONTEND=noninteractive \
    UV_LINK_MODE=copy

# Persist the virtualenv location into the image environment and put its bin/
# directory first on PATH, followed by /root/.local/bin.
ENV UV_PROJECT_ENVIRONMENT=${UV_PROJECT_ENVIRONMENT}
ENV VIRTUAL_ENV=${UV_PROJECT_ENVIRONMENT}
ENV PATH="${UV_PROJECT_ENVIRONMENT}/bin:/root/.local/bin:${PATH}"

# Bootstrap step: installs a few OS packages, creates the /opt/jet virtualenv,
# and downloads the yq binary and the uv installer at the versions given by the
# YQ_VERSION / UV_VERSION build args.
#
# The heredoc delimiter is quoted, so the builder does not substitute variables;
# ${YQ_VERSION} and ${UV_VERSION} are expanded by bash when the step runs.
#
# `-o pipefail` is required because of the `curl | sh` pipeline at the end:
# without it a failed download is masked by `sh` exiting 0 on empty input and
# the layer would be built without uv.
RUN bash -ex -o pipefail <<"EOF"
    apt-get update
    apt-get install -y --no-install-recommends gettext python3-venv psmisc
    apt-get clean
    # `apt-get clean` only drops downloaded .deb archives; remove the package
    # index lists in the same layer as well so they do not bloat the image.
    rm -rf /var/lib/apt/lists/*
    python -m venv /opt/jet
    # NOTE(review): the yq asset is hard-coded to linux/amd64.
    wget https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_amd64 -O /usr/local/bin/yq
    chmod a+x /usr/local/bin/yq
    curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
EOF

ARG WHEEL_DIR

# Copy the project metadata files and the two megatron/core modules into
# /workspace ahead of the dependency sync (presumably the minimum uv needs to
# resolve the project -- confirm against pyproject.toml).
COPY README.md pyproject.toml uv.lock /workspace/
COPY megatron/core/__init__.py megatron/core/package_info.py /workspace/megatron/core/

# Create the virtualenv and sync dependencies with uv, reusing uv's download
# cache across builds through a BuildKit cache mount.
RUN --mount=type=cache,target=/root/.cache/uv \
    bash -ex <<"EOF"
    export NVTE_CUDA_ARCHS="80;90;100"

    # Packages that the locked sync must not install. The venv is created with
    # --system-site-packages, so these are presumably provided by the base
    # image -- confirm.
    skipped_packages=(
        torch
        torchvision
        triton
        nvidia-cublas-cu12
        nvidia-cuda-cupti-cu12
        nvidia-cuda-nvrtc-cu12
        nvidia-cuda-runtime-cu12
        nvidia-cudnn-cu12
        nvidia-cufft-cu12
        nvidia-cufile-cu12
        nvidia-curand-cu12
        nvidia-cusolver-cu12
        nvidia-cusparse-cu12
        nvidia-cusparselt-cu12
        nvidia-nccl-cu12
    )
    skip_flags=()
    for pkg in "${skipped_packages[@]}"; do
        skip_flags+=(--no-install-package "${pkg}")
    done

    uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
    # First sync only the "build" dependency group, then the full locked set.
    uv sync --only-group build
    uv sync --extra dev --extra mlm --link-mode copy --locked "${skip_flags[@]}"
EOF

# Install DeepEP
#
# Steps: install the NVSHMEM wheel, expose its host library under the
# unversioned .so name, clone DeepEP at tag v1.2.1, apply the local patch,
# build/install it without build isolation, then delete the checkout in the
# same layer so the sources do not remain in the image.
COPY docker/patches/deepep.patch /workspace/deepep.patch
RUN bash -ex <<"EOF"
    cd /workspace
    uv pip install nvidia-nvshmem-cu13
    # Use the configured virtualenv location rather than a literal /opt/venv so
    # this step keeps working when UV_PROJECT_ENVIRONMENT is overridden.
    # NOTE(review): the python3.12 path component is still hard-coded and must
    # match the interpreter version in the base image.
    pushd "${UV_PROJECT_ENVIRONMENT}/lib/python3.12/site-packages/nvidia/nvshmem/lib/"
        # -f keeps the step from failing if the unversioned name already exists
        # (presumably the DeepEP build looks for libnvshmem_host.so -- confirm).
        ln -sf libnvshmem_host.so.3 libnvshmem_host.so
    popd

    # Only the tagged snapshot is needed, so skip fetching the full history.
    git clone --branch v1.2.1 --depth 1 https://github.com/deepseek-ai/DeepEP.git
    pushd DeepEP
        patch -p1 < /workspace/deepep.patch
    popd
    TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/.
    rm -rf DeepEP
EOF

# Copy the contents of the build context's assets/ directory to /opt/data/.
COPY assets/ /opt/data/
# Set UV_PYTHON to the virtualenv's interpreter (presumably so later uv
# invocations use that interpreter -- confirm against the uv documentation).
ENV UV_PYTHON=$UV_PROJECT_ENVIRONMENT/bin/python

##### For NVIDIANS only #####
# Stage "jet": extends "main" with the jet-api package installed into the
# dedicated /opt/jet virtualenv, whose bin/ directory is appended to PATH.
FROM main AS jet
# Version of jet-api to install; no default, so it must be passed as a build arg.
ARG JET_API_VERSION
ENV PATH="$PATH:/opt/jet/bin"
# xtrace (-x) is deliberately NOT enabled for this step: it would print the
# expanded contents of the JET_INDEX_URLS secret into the build log.
RUN --mount=type=secret,id=JET_INDEX_URLS bash -e <<"EOF"
    # Fail early with a clear message instead of an invalid "jet-api==" spec.
    : "${JET_API_VERSION:?JET_API_VERSION build arg must be set for the jet stage}"
    JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS)
    python -m venv /opt/jet
    # $JET_INDEX_URLS is left unquoted on purpose so it can word-split into
    # separate pip arguments (presumably index-url flags -- confirm).
    /opt/jet/bin/pip install --no-cache-dir $JET_INDEX_URLS \
        jet-api==$JET_API_VERSION
EOF

# Install one-logger, a setuptools release below 80.0.0, and jet-client 2.x
# with `uv pip`, reading the package index arguments from BuildKit secrets.
# xtrace (-x) is deliberately NOT enabled for this step: it would print the
# expanded contents of both secrets into the build log.
RUN --mount=type=secret,id=JET_INDEX_URLS \
    --mount=type=secret,id=LOGGER_INDEX_URL bash -e <<"EOF"
    JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS)
    LOGGER_INDEX_URL=$(cat /run/secrets/LOGGER_INDEX_URL)
    # The secret variables are left unquoted on purpose so they can word-split
    # into separate arguments (presumably index-url flags -- confirm).
    uv pip install --no-cache-dir --upgrade $LOGGER_INDEX_URL "one-logger"
    uv pip install --no-cache-dir --upgrade "setuptools<80.0.0"
    uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=2.0"
EOF
###