name: Self-hosted runner (scheduled)

# Triggers: pushes to branches matching `multi_ci_*`, repository_dispatch
# events, and a cron schedule.
on:
  push:
    branches:
      - "multi_ci_*"
  repository_dispatch:
  schedule:
    # Minute 0, hour 0, every day.
    - cron: "0 0 * * *"

# Workflow-level environment. Environment variables are strings, so the
# boolean-looking and numeric values are quoted to make that explicit.
env:
  # Same path as the mount target used in the jobs' container `options`.
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: "yes"
  RUN_SLOW: "yes"
  OMP_NUM_THREADS: "16"
  MKL_NUM_THREADS: "16"
  PYTEST_TIMEOUT: "600"
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: "true"
  RUN_PT_TF_CROSS_TESTS: "1"

jobs:
  # Single-GPU PyTorch job. Runs, in order, the full test suite, the example
  # tests and the tests marked `is_pipeline_test`, prints the report files
  # after each run and finally uploads the whole `reports` directory.
  run_all_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      # The host cache directory is mounted at /mnt/cache/, the path the
      # workflow-level HF_HOME points to.
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
          python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
          wandb login ${{ secrets.WANDB_API_KEY }}

      - name: Are GPUs recognized by our DL frameworks
        run: |
          utils/print_env_pt.py

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

      # `always()` keeps the report steps running when the test step failed.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_durations.txt

      # No step-level `env` here: OMP_NUM_THREADS, MKL_NUM_THREADS, RUN_SLOW,
      # HF_HOME and TRANSFORMERS_IS_CI are already set, with the same values,
      # by the workflow-level `env` and are inherited by this step.
      - name: Run examples tests on GPU
        if: ${{ always() }}
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/examples_torch_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/examples_torch_gpu_durations.txt

      # Third pass over `tests`, restricted to the `is_pipeline_test` marker.
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_gpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

#  run_all_tests_flax_gpu:
#    runs-on: [self-hosted, docker-gpu-test, single-gpu]
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#
#      - name: NVIDIA-SMI
#        continue-on-error: true
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          pip install --upgrade pip
#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
#          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
#          pip install https://github.com/kpu/kenlm/archive/master.zip
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#      - name: Run all tests on GPU
#        run: |
#          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_flax_gpu_failures_short.txt
#
#      - name: Test durations
#        if: ${{ always() }}
#        run: cat reports/tests_flax_gpu_durations.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_flax_gpu_test_reports
#          path: reports

  # Single-GPU TensorFlow job. Runs the full test suite and then the tests
  # marked `is_pipeline_test` inside a tensorflow/tensorflow container, prints
  # the report files after each run and uploads the `reports` directory.
  run_all_tests_tf_gpu:
    runs-on:
      - self-hosted
      - docker-gpu
      - single-gpu
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus 0
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
          pip install https://github.com/kpu/kenlm/archive/master.zip

      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests

      # `always()` keeps the report steps running when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_gpu_failures_short.txt

      - name: Test durations
        if: always()
        run: cat reports/tests_tf_gpu_durations.txt

      # Second pass over `tests`, restricted to the `is_pipeline_test` marker.
      - name: Run all pipeline tests on GPU
        if: always()
        env:
          RUN_PIPELINE_TESTS: "yes"
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt

      - name: Test durations
        if: always()
        run: cat reports/tests_tf_pipeline_gpu_durations.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_gpu_test_reports
          path: reports

  # PyTorch/XLA job on a TPU runner. Runs examples/pytorch/test_xla_examples.py
  # inside a privileged container, prints the report files and uploads the
  # `reports` directory.
  run_all_examples_torch_xla_tpu:
    runs-on:
      - self-hosted
      - docker-tpu-test
      - tpu-v3-8
    container:
      image: gcr.io/tpu-pytorch/xla:nightly_3.8_tpuvm
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --privileged
        -v "/lib/libtpu.so:/lib/libtpu.so"
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
        --shm-size 16G
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install .[testing]

      - name: Are TPUs recognized by our DL frameworks
        env:
          XRT_TPU_CONFIG: "localservice;0;localhost:51011"
        run: |
          python -c "import torch_xla.core.xla_model as xm; print(xm.xla_device())"

      - name: Run example tests on TPU
        env:
          XRT_TPU_CONFIG: "localservice;0;localhost:51011"
          MKL_SERVICE_FORCE_INTEL: "1"  # See: https://github.com/pytorch/pytorch/issues/37377
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_xla_tpu examples/pytorch/test_xla_examples.py

      # `always()` keeps the report steps running when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_xla_tpu_failures_short.txt

      - name: Tests durations
        if: always()
        run: cat reports/tests_torch_xla_tpu_durations.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_examples_torch_xla_tpu
          path: reports

  # Multi-GPU PyTorch job. Runs the full test suite and then the tests marked
  # `is_pipeline_test` in a container started with `--gpus all`, prints the
  # report files after each run and uploads the `reports` directory.
  run_all_tests_torch_multi_gpu:
    runs-on:
      - self-hosted
      - docker-gpu
      - multi-gpu
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus all
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      # A failing nvidia-smi does not fail the job.
      - name: NVIDIA-SMI
        continue-on-error: true
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
          python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
          wandb login ${{ secrets.WANDB_API_KEY }}

      - name: Are GPUs recognized by our DL frameworks
        run: utils/print_env_pt.py

      - name: Run all tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: "1"
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      # `always()` keeps the report steps running when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Test durations
        if: always()
        run: cat reports/tests_torch_multi_gpu_durations.txt

      # Second pass over `tests`, restricted to the `is_pipeline_test` marker.
      - name: Run all pipeline tests on GPU
        if: always()
        env:
          RUN_PIPELINE_TESTS: "yes"
        run: python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

      - name: Test durations
        if: always()
        run: cat reports/tests_torch_pipeline_multi_gpu_durations.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

  # Multi-GPU TensorFlow job. Runs the full test suite and then the tests
  # marked `is_pipeline_test` in a container started with `--gpus all`, prints
  # the report files after each run and uploads the `reports` directory.
  run_all_tests_tf_multi_gpu:
    runs-on:
      - self-hosted
      - docker-gpu
      - multi-gpu
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus all
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      # A failing nvidia-smi does not fail the job.
      - name: NVIDIA-SMI
        continue-on-error: true
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
          pip install https://github.com/kpu/kenlm/archive/master.zip

      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      # `always()` keeps the report steps running when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

      - name: Test durations
        if: always()
        run: cat reports/tests_tf_multi_gpu_durations.txt

      # Second pass over `tests`, restricted to the `is_pipeline_test` marker.
      - name: Run all pipeline tests on GPU
        if: always()
        env:
          RUN_PIPELINE_TESTS: "yes"
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: always()
        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt

      - name: Test durations
        if: always()
        run: cat reports/tests_tf_pipeline_multi_gpu_durations.txt

      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

#  run_all_tests_flax_multi_gpu:
#    runs-on: [self-hosted, docker-gpu, multi-gpu]
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#
#      - name: NVIDIA-SMI
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          pip install --upgrade pip
#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
#          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#      - name: Run all tests on GPU
#        run: |
#          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_flax_gpu_failures_short.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_flax_gpu_test_reports
#          path: reports

  # Single-GPU job for the CUDA-extension tests. Installs the `deepspeed`
  # extra in an NVIDIA PyTorch container, runs tests/deepspeed and
  # tests/extended, prints the report files and uploads the `reports` directory.
  run_all_tests_torch_cuda_extensions_gpu:
    runs-on:
      - self-hosted
      - docker-gpu
      - single-gpu
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --gpus 0
        --shm-size "16gb"
        --ipc host
        -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed]

      - name: Are GPUs recognized by our DL frameworks
        run: utils/print_env_pt.py

      - name: Run all tests on GPU
        run: python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      # `always()` keeps the report steps running when the test step failed.
      - name: Failure short reports
        if: always()
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      - name: Test durations
        if: always()
        run: cat reports/tests_torch_cuda_extensions_gpu_durations.txt

      # NOTE(review): the artifact name starts with `run_tests_`, not
      # `run_all_tests_` like the job; kept as is in case a consumer of the
      # artifacts relies on it.
      - name: Test suite reports artifacts
        if: always()
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_gpu_test_reports
          path: reports

  # Multi-GPU job for the CUDA-extension tests. Installs the `deepspeed` and
  # `fairscale` extras in an NVIDIA PyTorch container, runs tests/deepspeed and
  # tests/extended, prints the report files and uploads the `reports` directory.
  run_all_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # `--gpus all`, like the other jobs on the multi-gpu runner in this
      # workflow, so the container is given every GPU of the host.
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      # A failing nvidia-smi does not fail the job.
      - name: NVIDIA-SMI
        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
          pip install .[testing,deepspeed,fairscale]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          utils/print_env_pt.py

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

      # `always()` keeps the report steps running when the test step failed.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
          path: reports

  # Reporting job. Waits for the GPU jobs listed in `needs`, downloads the
  # run's artifacts and calls utils/notification_service.py with the
  # `scheduled` argument.
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    # Run even when some of the needed jobs failed.
    if: always()
    # NOTE(review): run_all_examples_torch_xla_tpu is not listed here — confirm
    # that leaving the TPU job out of the notification is intended.
    needs:
      - run_all_tests_torch_gpu
      - run_all_tests_tf_gpu
      - run_all_tests_torch_multi_gpu
      - run_all_tests_tf_multi_gpu
      - run_all_tests_torch_cuda_extensions_gpu
      - run_all_tests_torch_cuda_extensions_multi_gpu
    steps:
      - uses: actions/checkout@v2

      # No `name` input is given; presumably this fetches every artifact of
      # the run — confirm against the download-artifact documentation.
      - uses: actions/download-artifact@v2

      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
        run: |
          pip install slack_sdk
          python utils/notification_service.py scheduled