"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "2e153930cfddf327b86abd46daf8bc58671a094a"
Unverified commit ca169dbd, authored by Yih-Dar and committed by GitHub

Enable PyTorch nightly build CI (#17335)



* nightly build pytorch CI

* fix working dir

* change time and event name
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 3c7e56fb
@@ -39,6 +39,33 @@ jobs:
          push: true
          tags: huggingface/transformers-all-latest-gpu
latest-with-torch-nightly-docker:
name: "Nightly PyTorch + Stable TensorFlow"
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
PYTORCH=pre
push: true
tags: huggingface/transformers-all-latest-torch-nightly-gpu
  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on: ubuntu-latest
@@ -65,6 +92,32 @@ jobs:
          push: true
          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu
nightly-torch-deepspeed-docker:
name: "Nightly PyTorch + DeepSpeed"
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
  doc-builder:
    name: "Doc builder"
    runs-on: ubuntu-latest
......
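Both new jobs only wire existing Dockerfiles into the scheduled build, so the images can also be reproduced by hand. A minimal local sketch, assuming a checkout of the repository root and a working Docker install (the tags match the ones pushed by the workflow):

# Nightly PyTorch + stable TensorFlow image; PYTORCH=pre flips the Dockerfile to the nightly torch channel
docker build ./docker/transformers-all-latest-gpu \
    --build-arg REF=main \
    --build-arg PYTORCH=pre \
    -t huggingface/transformers-all-latest-torch-nightly-gpu

# Nightly PyTorch + DeepSpeed image
docker build ./docker/transformers-pytorch-deepspeed-nightly-gpu \
    --build-arg REF=main \
    -t huggingface/transformers-pytorch-deepspeed-nightly-gpu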
The nightly workflow file is rewritten wholesale. The old version ran fixed `run_all_tests_torch_gpu` / `run_all_tests_torch_multi_gpu` / `run_all_tests_torch_cuda_extensions_*` jobs on stock `pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime` and `nvcr.io/nvidia/pytorch:21.03-py3` containers, installing dependencies and nightly `torch` via pip inside each job on a `0 0 */3 * *` cron. The new version below runs daily, uses the prebuilt nightly images, and fans tests out over a per-folder matrix computed by a `setup` job:

name: Self-hosted runner (nightly)

# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`

on:
  repository_dispatch:
  schedule:
    - cron: "0 16 * * *"

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1

jobs:
  setup:
    name: Setup
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports

      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
          echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

  run_tests_single_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [single-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_tests_multi_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_all_tests_torch_cuda_extensions_gpu:
    name: Torch CUDA extension tests
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /workspace/transformers
        run: git fetch && git checkout ${{ github.sha }}

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /workspace/transformers
        run: |
          python utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /workspace/transformers
        run: |
          python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu

  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: nightly-build
        # We pass `needs.setup.outputs.matrix` as the argument. Processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          pip install slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
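The `Identify models to test` one-liner in the `setup` job is dense; unrolled, it is equivalent to the following readable Python sketch (illustrative only, not part of the commit):

# Run from /transformers/tests, mirroring the step's `working-directory`.
import os

tests = os.getcwd()
model_tests = os.listdir(os.path.join(tests, "models"))

# Top-level test folders (e.g. pipeline or extended suites), minus the `models` umbrella folder itself
d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
d1.remove("models")

# Per-model folders as `models/<name>` relative paths
d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))

# Model folders first, then the remaining suites; the printed Python list is
# consumed downstream via `fromJson(needs.setup.outputs.matrix)`
print(d2 + d1)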
@@ -207,7 +207,7 @@ jobs:
      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
-       working-directory: /workspace/transformers
+       working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
@@ -217,10 +217,12 @@ jobs:
          nvidia-smi

      - name: Environment
+       working-directory: /workspace/transformers
        run: |
          python utils/print_env.py

      - name: Run all non-slow selected tests on GPU
+       working-directory: /workspace/transformers
        # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
        run: |
          python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
@@ -256,7 +258,7 @@ jobs:
      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
-       working-directory: /workspace/transformers
+       working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
......
@@ -308,7 +308,7 @@ jobs:
      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
-       working-directory: /workspace/transformers
+       working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
......
@@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
# Use a login shell to read variables from `~/.profile` (to pass dynamically created variables between RUN commands)
SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
@@ -21,11 +24,20 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
-RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
+# TODO: Handle these in a python utility script
RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile
RUN echo torch=$VERSION
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI).
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip install --no-cache-dir -U tensorflow
RUN python3 -m pip uninstall -y flax jax
-RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$PYTORCH+$CUDA.html
+# Use the installed torch version for `torch-scatter`, to avoid dealing with PYTORCH='pre'.
+# If torch is a nightly version, the link is likely to be invalid, but the installation falls back to the latest torch-scatter.
+RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html
RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
......
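The `SHELL ["sh", "-lc"]` line works because every subsequent `RUN` then executes under a login shell, which sources `~/.profile` first; that is what lets the `VERSION` variable exported in one `RUN` survive into the next. A stripped-down illustration (hypothetical image and variable name, not from this commit):

FROM ubuntu:20.04
# Every RUN now executes via `sh -l`, which reads ~/.profile before running the command
SHELL ["sh", "-lc"]
# Persist a dynamically computed value for later build steps
RUN echo "export TORCH_SPEC='torch==1.11.*'" >> ~/.profile
# A separate RUN step still sees TORCH_SPEC because ~/.profile is sourced again
RUN echo "Would install: $TORCH_SPEC"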
@@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu113'
RUN apt -y update
RUN apt install -y libaio-dev
RUN python3 -m pip install --no-cache-dir --upgrade pip
@@ -13,13 +16,16 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# Install latest release PyTorch
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
+RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
-# Pre-build DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
+# Pre-build the **latest** DeepSpeed, so it is ready for testing (otherwise, the 1st deepspeed test will time out)
RUN python3 -m pip uninstall -y deepspeed
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+# This has to be run (again) inside the GPU VMs running the tests.
+# The installation works here, but some tests fail if we don't pre-build deepspeed again in the VMs running the tests.
+# TODO: Find out why the tests fail.
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for python to be aware of transformers.
......
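Because the DeepSpeed C++/CUDA ops are pre-compiled at image build time, one quick way to verify the result is DeepSpeed's own `ds_report` tool, which lists which ops are pre-built. A one-line check, assuming the image tag produced by the workflow above:

docker run --rm --gpus all huggingface/transformers-pytorch-deepspeed-latest-gpu ds_report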
FROM nvcr.io/nvidia/pytorch:21.03-py3
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu113'
RUN apt -y update
RUN apt install -y libaio-dev
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# Install **nightly** release PyTorch (flag `--pre`)
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
# DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
# Disabled for now, as deepspeed is not installed above. To be enabled once the issue is fixed.
# RUN python3 -c "from deepspeed.launcher.runner import main"