name: Self-hosted runner; Nightly (scheduled)

on:
  push:
    branches:
      - nightly_ci*
  repository_dispatch:
  schedule:
    - cron: "0 0 */3 * *"

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  RUN_SLOW: yes
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16
  PYTEST_TIMEOUT: 600

jobs:
  run_all_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
        apt -y update && apt install -y libsndfile1-dev git
        pip install --upgrade pip
        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U

    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

    - name: Run all tests on GPU
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_gpu_failures_short.txt

    - name: Run examples tests on GPU
      if: ${{ always() }}
      env:
        OMP_NUM_THREADS: 16
        MKL_NUM_THREADS: 16
        RUN_SLOW: yes
        HF_HOME: /mnt/cache
        TRANSFORMERS_IS_CI: yes
      run: |
        pip install -r examples/pytorch/_tests_requirements.txt
        python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/examples_torch_gpu_failures_short.txt

    - name: Run all pipeline tests on GPU
      if: ${{ always() }}
      env:
        RUN_PIPELINE_TESTS: yes
      run: |
        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_all_tests_torch_gpu_test_reports
        path: reports

  run_all_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    - name: NVIDIA-SMI
      continue-on-error: true
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
        apt -y update && apt install -y libsndfile1-dev git
        pip install --upgrade pip
        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U

    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
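    # Note on the pytest invocations below: -n 1 with --dist=loadfile runs pytest-xdist with a
    # single worker, grouping tests by file; --make-reports is not a built-in pytest flag but an
    # option defined by this repository's conftest.py, which writes the report files under
    # reports/ that the "Failure short reports" steps then cat.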
    - name: Run all tests on GPU
      env:
        MKL_SERVICE_FORCE_INTEL: 1
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_multi_gpu_failures_short.txt

    - name: Run all pipeline tests on GPU
      if: ${{ always() }}
      env:
        RUN_PIPELINE_TESTS: yes
      run: |
        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_all_tests_torch_multi_gpu_test_reports
        path: reports

  run_all_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
        apt -y update && apt install -y libaio-dev
        pip install --upgrade pip
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
        pip install .[testing,deepspeed]
        pip install git+https://github.com/microsoft/DeepSpeed

    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

    - name: Run all tests on GPU
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_tests_torch_cuda_extensions_gpu_test_reports
        path: reports

  run_all_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # --gpus all (rather than --gpus 0) so the multi-gpu job actually sees every GPU,
      # matching the run_all_tests_torch_multi_gpu job above.
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    - name: NVIDIA-SMI
      continue-on-error: true
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
        apt -y update && apt install -y libaio-dev
        pip install --upgrade pip
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
        pip install .[testing,deepspeed,fairscale]
        pip install git+https://github.com/microsoft/DeepSpeed

    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

    - name: Run all tests on GPU
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
        path: reports
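  # Aggregation job: runs even when the test jobs above fail (if: always()), downloads every
  # uploaded report artifact, and posts a summary of the nightly run to Slack via
  # utils/notification_service.py.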
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [
        run_all_tests_torch_gpu,
        run_all_tests_torch_multi_gpu,
        run_all_tests_torch_cuda_extensions_gpu,
        run_all_tests_torch_cuda_extensions_multi_gpu
    ]
    steps:
    - uses: actions/checkout@v2
    - uses: actions/download-artifact@v2
    - name: Send message to Slack
      env:
        CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
        CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
        CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
        CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
      run: |
        pip install slack_sdk
        python utils/notification_service.py scheduled nightly-torch