name: Self-hosted runner (push)

on:
  push:
    branches:
      - master
      - ci_*
      - ci-*
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
  repository_dispatch:

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60

jobs:
  run_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all non-slow tests on GPU
        run: |
          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

#  run_tests_tf_gpu:
#    runs-on: [self-hosted, docker-gpu, single-gpu]
#    timeout-minutes: 120
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#
#      - name: NVIDIA-SMI
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          pip install --upgrade pip
#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
#
#      - name: Run all non-slow tests on GPU
#        env:
#          TF_NUM_INTRAOP_THREADS: 8
#          TF_NUM_INTEROP_THREADS: 1
#        run: |
#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_tf_gpu_failures_short.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_tf_gpu_test_reports
#          path: reports

  run_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
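      # Explanatory note (an assumption, not stated in the original file):
      # MKL_SERVICE_FORCE_INTEL=1 in the step below presumably works around the
      # mkl-service "MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1"
      # abort that can occur when worker processes are forked after MKL has
      # already been initialized (as pytest-xdist does with -n 2).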
      - name: Run all non-slow tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

#  run_tests_tf_multi_gpu:
#    runs-on: [self-hosted, docker-gpu, multi-gpu]
#    timeout-minutes: 120
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#
#      - name: NVIDIA-SMI
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          pip install --upgrade pip
#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
#
#      - name: Run all non-slow tests on GPU
#        env:
#          TF_NUM_INTRAOP_THREADS: 8
#          TF_NUM_INTEROP_THREADS: 1
#        run: |
#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_tf_multi_gpu_failures_short.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_tf_multi_gpu_test_reports
#          path: reports

  run_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_gpu_test_reports
          path: reports

  run_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed,fairscale]
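      # Explanatory note (an assumption, not stated in the original file): the
      # NGC PyTorch image used by the two CUDA-extension jobs, unlike the
      # pytorch/pytorch:*-runtime images above, ships with the full CUDA toolkit
      # (nvcc), which DeepSpeed needs to JIT-compile its CUDA/C++ ops; libaio-dev
      # is installed so its async I/O extension can build.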
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
          path: reports

  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [
        run_tests_torch_gpu,
#        run_tests_tf_gpu,
        run_tests_torch_multi_gpu,
#        run_tests_tf_multi_gpu,
        run_tests_torch_cuda_extensions_gpu,
        run_tests_torch_cuda_extensions_multi_gpu
    ]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
        run: |
          pip install slack_sdk
          python utils/notification_service.py push
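# Note: `if: always()` on the send_results job overrides the implicit success()
# condition that `needs` would otherwise apply, so the Slack report is sent even
# when some of the test jobs above fail or are cancelled.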