Dockerfile 2.65 KB
Newer Older
1
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12
2
# Base image: NVIDIA's PyTorch container from nvcr.io, pinned to the `22.12-py3` tag.
FROM nvcr.io/nvidia/pytorch:22.12-py3
3
4
5
6
# Image metadata: records the maintainer as a label.
LABEL maintainer="Hugging Face"

# Makes Debian package tooling run without interactive prompts during the build.
# Declared as ARG (not ENV), so it applies to this build's RUN steps only and is not
# persisted in the final image's environment.
ARG DEBIAN_FRONTEND=noninteractive

Yih-Dar's avatar
Yih-Dar committed
7
# PyTorch version to install; consumed further down as `torch==$PYTORCH`.
# Override at build time with `--build-arg PYTORCH=<version>`.
ARG PYTORCH='2.0.1'
8
# Example: `cu102`, `cu113`, etc.
9
# CUDA wheel variant; appended to the PyTorch wheel index URL
# (`https://download.pytorch.org/whl/$CUDA`) when torch is installed.
ARG CUDA='cu118'
10

11
12
13
14
# Install the libaio development package.
# `update` and `install` share a single layer so a cached, stale package index can never be
# reused by a later install, and the index files are removed in that same layer so they do
# not end up in the image. `apt-get` is used because `apt` has no stable scripting interface.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libaio-dev \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip itself before any Python packages are installed.
RUN python3 -m pip install --no-cache-dir --upgrade pip

15
# Git ref of `transformers` that is checked out after cloning; defaults to `main`.
# Override at build time with `--build-arg REF=<ref>`.
ARG REF=main
16
17
# Clone `transformers` into ./transformers and switch to the requested ref.
# `$REF` is quoted so an unusual ref value cannot be word-split or glob-expanded by the shell.
RUN git clone https://github.com/huggingface/transformers \
    && cd transformers \
    && git checkout "$REF"

18
19
# Remove any already-installed torch / torchvision / torchaudio before the pinned install below.
RUN python3 -m pip uninstall --yes \
        torch \
        torchvision \
        torchaudio

20
# Install the PyTorch release selected by the `PYTORCH` build argument
Yih-Dar's avatar
Yih-Dar committed
21
22
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
23
# Install the pinned torch release together with torchvision and torchaudio, additionally
# searching the PyTorch wheel index for the selected CUDA variant.
RUN python3 -m pip install --no-cache-dir --upgrade \
        torch==${PYTORCH} \
        torchvision \
        torchaudio \
        --extra-index-url https://download.pytorch.org/whl/${CUDA}
24

Yih-Dar's avatar
Yih-Dar committed
25
26
# Install the cloned `transformers` checkout together with its `deepspeed-testing` extra.
# The requirement is quoted because `[...]` is a glob character class to the shell; unquoted,
# it would be expanded if a matching path happened to exist.
RUN python3 -m pip install --no-cache-dir "./transformers[deepspeed-testing]"

Yih-Dar's avatar
Yih-Dar committed
27
28
# Install `accelerate` straight from the `main` branch of its GitHub repository.
RUN python3 -m pip install --no-cache-dir \
        git+https://github.com/huggingface/accelerate@main#egg=accelerate

29
30
31
# The base image ships `transformer-engine`; remove it.
RUN python3 -m pip uninstall --yes transformer-engine

Yih-Dar's avatar
Yih-Dar committed
32
33
# The base image ships `torch-tensorrt`; remove it.
RUN python3 -m pip uninstall --yes torch-tensorrt
34

35
36
37
# Recompile apex: drop the installed copy, then fetch the upstream sources into ./apex.
RUN python3 -m pip uninstall --yes apex
RUN git clone https://github.com/NVIDIA/apex
38
# `MAX_JOBS=1` disables parallel building to avoid running out of CPU memory when the image is built on (standard) GitHub Actions runners
39
# Build apex from a pinned commit with its C++ and CUDA extensions enabled.
# MAX_JOBS=1 is set for the pip invocation only.
RUN cd apex \
    && git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b \
    && MAX_JOBS=1 python3 -m pip install \
        --global-option="--cpp_ext" \
        --global-option="--cuda_ext" \
        --no-cache \
        -v \
        --disable-pip-version-check \
        .
40

41
# Pre-build the **latest** DeepSpeed so it is ready for testing (otherwise, the first DeepSpeed test will time out)
Yih-Dar's avatar
Yih-Dar committed
42
# Remove any existing DeepSpeed install so the build below starts clean.
RUN python3 -m pip uninstall --yes deepspeed
43
44
45
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why the tests fail.
Yih-Dar's avatar
Yih-Dar committed
46
# Install DeepSpeed with the DS_BUILD_CPU_ADAM, DS_BUILD_FUSED_ADAM and DS_BUILD_UTILS flags set,
# passing `build_ext -j8` through to the build. stderr is merged into stdout.
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 \
    python3 -m pip install deepspeed \
        --global-option="build_ext" \
        --global-option="-j8" \
        --no-cache \
        -v \
        --disable-pip-version-check \
        2>&1
Yih-Dar's avatar
Yih-Dar committed
47

48
49
50
51
# Register the cloned checkout with Python through `setup.py develop`.
# Without this step, `transformers` is not recognized as a package when it is installed
# in editable mode.
RUN cd transformers \
    && python3 setup.py develop

Yih-Dar's avatar
Yih-Dar committed
52
# The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails
53
# Upgrade pydantic while keeping it below 2.x.
# The specifier must be quoted: unquoted, the shell parses `<2` as "redirect stdin from a file
# named 2", so pip never sees the version constraint (and the step fails if that file is missing).
RUN python3 -m pip install -U --no-cache-dir "pydantic<2"
54
# Import check: this build step fails if the DeepSpeed launcher entry point cannot be imported.
RUN python3 -c "from deepspeed.launcher.runner import main"