"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "2e153930cfddf327b86abd46daf8bc58671a094a"
Unverified commit ca169dbd, authored by Yih-Dar and committed by GitHub

Enable PyTorch nightly build CI (#17335)



* nightly build pytorch CI

* fix working dir

* change time and event name
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 3c7e56fb
@@ -39,6 +39,33 @@ jobs:
          push: true
          tags: huggingface/transformers-all-latest-gpu
latest-with-torch-nightly-docker:
name: "Nightly PyTorch + Stable TensorFlow"
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
PYTORCH=pre
push: true
tags: huggingface/transformers-all-latest-torch-nightly-gpu
  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on: ubuntu-latest
@@ -65,6 +92,32 @@ jobs:
          push: true
          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu
nightly-torch-deepspeed-docker:
name: "Nightly PyTorch + DeepSpeed"
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
  doc-builder:
    name: "Doc builder"
    runs-on: ubuntu-latest
......
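Both new jobs only wire existing Dockerfiles into the scheduled build, so the images can also be reproduced by hand. A minimal local sketch, assuming a checkout of the repository root and a working Docker install (the tags match the ones pushed by the workflow):

# Nightly PyTorch + stable TensorFlow image; PYTORCH=pre flips the Dockerfile to the nightly torch channel
docker build ./docker/transformers-all-latest-gpu \
    --build-arg REF=main \
    --build-arg PYTORCH=pre \
    -t huggingface/transformers-all-latest-torch-nightly-gpu

# Nightly PyTorch + DeepSpeed image
docker build ./docker/transformers-pytorch-deepspeed-nightly-gpu \
    --build-arg REF=main \
    -t huggingface/transformers-pytorch-deepspeed-nightly-gpu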
The nightly workflow file is rewritten wholesale. The old version ran fixed `run_all_tests_torch_gpu` / `run_all_tests_torch_multi_gpu` / `run_all_tests_torch_cuda_extensions_*` jobs on stock `pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime` and `nvcr.io/nvidia/pytorch:21.03-py3` containers, installing dependencies and nightly `torch` via pip inside each job on a `0 0 */3 * *` cron. The new version below runs daily, uses the prebuilt nightly images, and fans tests out over a per-folder matrix computed by a `setup` job:

name: Self-hosted runner (nightly)

# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`

on:
  repository_dispatch:
  schedule:
    - cron: "0 16 * * *"

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1

jobs:
  setup:
    name: Setup
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports

      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
          echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

  run_tests_single_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [single-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_tests_multi_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}

  run_all_tests_torch_cuda_extensions_gpu:
    name: Torch CUDA extension tests
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /workspace/transformers
        run: git fetch && git checkout ${{ github.sha }}

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
        working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /workspace/transformers
        run: |
          python utils/print_env.py

      - name: Run all tests on GPU
        working-directory: /workspace/transformers
        run: |
          python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu

  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: nightly-build
        # We pass `needs.setup.outputs.matrix` as the argument. Processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          pip install slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
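The `Identify models to test` one-liner in the `setup` job is dense; unrolled, it is equivalent to the following readable Python sketch (illustrative only, not part of the commit):

# Run from /transformers/tests, mirroring the step's `working-directory`.
import os

tests = os.getcwd()
model_tests = os.listdir(os.path.join(tests, "models"))

# Top-level test folders (e.g. pipeline or extended suites), minus the `models` umbrella folder itself
d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
d1.remove("models")

# Per-model folders as `models/<name>` relative paths
d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))

# Model folders first, then the remaining suites; the printed Python list is
# consumed downstream via `fromJson(needs.setup.outputs.matrix)`
print(d2 + d1)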
@@ -207,7 +207,7 @@ jobs:
      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
-       working-directory: /workspace/transformers
+       working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
@@ -217,10 +217,12 @@ jobs:
          nvidia-smi

      - name: Environment
+       working-directory: /workspace/transformers
        run: |
          python utils/print_env.py

      - name: Run all non-slow selected tests on GPU
+       working-directory: /workspace/transformers
        # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
        run: |
          python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
@@ -256,7 +258,7 @@ jobs:
      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
-       working-directory: /workspace/transformers
+       working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
......
@@ -308,7 +308,7 @@ jobs:
      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again*
-       working-directory: /workspace/transformers
+       working-directory: /workspace
        run: |
          python3 -m pip uninstall -y deepspeed
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
......
@@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
# Use a login shell to read variables from `~/.profile` (to pass dynamically created variables between RUN commands)
SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
@@ -21,11 +24,20 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
-RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
+# TODO: Handle these in a python utility script
RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile
RUN echo torch=$VERSION
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI).
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip install --no-cache-dir -U tensorflow
RUN python3 -m pip uninstall -y flax jax
-RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$PYTORCH+$CUDA.html
+# Use the installed torch version for `torch-scatter`, to avoid dealing with PYTORCH='pre'.
+# If torch is a nightly version, the link is likely to be invalid, but the installation falls back to the latest torch-scatter.
+RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html
RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
......
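The `SHELL ["sh", "-lc"]` line works because every subsequent `RUN` then executes under a login shell, which sources `~/.profile` first; that is what lets the `VERSION` variable exported in one `RUN` survive into the next. A stripped-down illustration (hypothetical image and variable name, not from this commit):

FROM ubuntu:20.04
# Every RUN now executes via `sh -l`, which reads ~/.profile before running the command
SHELL ["sh", "-lc"]
# Persist a dynamically computed value for later build steps
RUN echo "export TORCH_SPEC='torch==1.11.*'" >> ~/.profile
# A separate RUN step still sees TORCH_SPEC because ~/.profile is sourced again
RUN echo "Would install: $TORCH_SPEC"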
@@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu113'
RUN apt -y update
RUN apt install -y libaio-dev
RUN python3 -m pip install --no-cache-dir --upgrade pip
@@ -13,13 +16,16 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# Install latest release PyTorch
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
+RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
-# Pre-build DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
+# Pre-build the **latest** DeepSpeed, so it is ready for testing (otherwise, the 1st deepspeed test will time out)
RUN python3 -m pip uninstall -y deepspeed
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+# This has to be run (again) inside the GPU VMs running the tests.
+# The installation works here, but some tests fail if we don't pre-build deepspeed again in the VMs running the tests.
+# TODO: Find out why the tests fail.
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for python to be aware of transformers.
......
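Because the DeepSpeed C++/CUDA ops are pre-compiled at image build time, one quick way to verify the result is DeepSpeed's own `ds_report` tool, which lists which ops are pre-built. A one-line check, assuming the image tag produced by the workflow above:

docker run --rm --gpus all huggingface/transformers-pytorch-deepspeed-latest-gpu ds_report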
FROM nvcr.io/nvidia/pytorch:21.03-py3
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu113'
RUN apt -y update
RUN apt install -y libaio-dev
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# Install **nightly** release PyTorch (flag `--pre`)
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
# DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
# Disabled for now, as deepspeed is not installed above. To be enabled once the issue is fixed.
# RUN python3 -c "from deepspeed.launcher.runner import main"