name: Self-hosted runner (scheduled)

on:
  push:
    branches:
      - multi_ci_*
  repository_dispatch:
  schedule:
    - cron: "0 0 * * *"

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  RUN_SLOW: yes
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16

jobs:
  run_all_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Run examples tests on GPU
        if: ${{ always() }}
        env:
          OMP_NUM_THREADS: 16
          MKL_NUM_THREADS: 16
          RUN_SLOW: yes
          HF_HOME: /mnt/cache
          TRANSFORMERS_IS_CI: yes
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/examples_torch_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

  run_all_tests_tf_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: yes
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_gpu_test_reports
          path: reports
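  # The next two jobs repeat the PyTorch and TensorFlow suites on runners labeled
  # multi-gpu. Apart from the runner label, `--gpus all` (so the container sees every
  # device), and the `*_multi_gpu` report names, the steps mirror the single-GPU
  # jobs above.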
  run_all_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

  run_all_tests_tf_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: yes
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
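  # The two CUDA-extensions jobs below exercise DeepSpeed (plus fairscale on the
  # multi-GPU variant), which JIT-compile custom CUDA ops. They therefore use NVIDIA's
  # NGC PyTorch image, which ships the full CUDA toolkit needed for that compilation,
  # install libaio-dev (needed by DeepSpeed's async I/O ops), and run only the
  # tests/deepspeed and tests/extended suites.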
"16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Launcher docker uses: actions/checkout@v2 - name: NVIDIA-SMI run: | nvidia-smi - name: Install dependencies run: | apt -y update && apt install -y libaio-dev pip install --upgrade pip pip install .[testing,deepspeed] - name: Are GPUs recognized by our DL frameworks run: | python -c "import torch; print('Cuda available:', torch.cuda.is_available())" python -c "import torch; print('Cuda version:', torch.version.cuda)" python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())" python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())" - name: Run all tests on GPU run: | python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ always() }} run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: name: run_tests_torch_cuda_extensions_gpu_test_reports path: reports run_all_tests_torch_cuda_extensions_multi_gpu: runs-on: [self-hosted, docker-gpu, multi-gpu] container: image: nvcr.io/nvidia/pytorch:21.03-py3 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Launcher docker uses: actions/checkout@v2 - name: NVIDIA-SMI run: | nvidia-smi - name: Install dependencies run: | apt -y update && apt install -y libaio-dev pip install --upgrade pip pip install .[testing,deepspeed,fairscale] - name: Are GPUs recognized by our DL frameworks run: | python -c "import torch; print('Cuda available:', torch.cuda.is_available())" python -c "import torch; print('Cuda version:', torch.version.cuda)" python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())" python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())" - name: Run all tests on GPU run: | python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ always() }} run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: name: run_tests_torch_cuda_extensions_multi_gpu_test_reports path: reports send_results: name: Send results to webhook runs-on: ubuntu-latest if: always() needs: [ run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu, run_all_tests_torch_cuda_extensions_gpu, run_all_tests_torch_cuda_extensions_multi_gpu ] steps: - uses: actions/checkout@v2 - uses: actions/download-artifact@v2 - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} run: | pip install slack_sdk python utils/notification_service.py scheduled