# Scheduled test suite for the self-hosted GPU / TPU runners.
#
# Triggers: pushes to `multi_ci_*` branches, `repository_dispatch` events, and
# a daily cron at 00:00 (GitHub evaluates cron schedules in UTC).
#
# Every test job follows the same shape: check out the repo inside a container,
# install dependencies, print the accelerator environment, run pytest with
# `--make-reports=<id>`, print the generated short-failure and duration
# reports, and upload the `reports` directory as an artifact. The final
# `send_results` job downloads the artifacts and runs the notification script.
name: Self-hosted runner (scheduled)

on:
  push:
    branches:
      - multi_ci_*
  repository_dispatch:
  schedule:
    - cron: "0 0 * * *"

# Workflow-wide environment, inherited by every step of every job.
# Values are quoted so they are passed as literal strings rather than relying
# on the YAML loader's implicit bool/int typing.
env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: "yes"
  RUN_SLOW: "yes"
  OMP_NUM_THREADS: "16"
  MKL_NUM_THREADS: "16"
  PYTEST_TIMEOUT: "600"
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: "true"
  RUN_PT_TF_CROSS_TESTS: "1"

jobs:
  # PyTorch: full test suite, examples and pipeline tests on a single-GPU runner.
  run_all_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
          python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
          wandb login ${{ secrets.WANDB_API_KEY }}

      - name: Are GPUs recognized by our DL frameworks
        run: |
          utils/print_env_pt.py

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

      # The reporting steps use `always()` so they still run after a test failure.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_durations.txt

      # OMP/MKL thread counts, RUN_SLOW, HF_HOME and TRANSFORMERS_IS_CI are
      # inherited from the workflow-level `env`, so they are not repeated here.
      - name: Run examples tests on GPU
        if: ${{ always() }}
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/examples_torch_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/examples_torch_gpu_durations.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: "yes"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_gpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

#  run_all_tests_flax_gpu:
#    runs-on: [self-hosted, docker-gpu-test, single-gpu]
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#
#      - name: NVIDIA-SMI
#        continue-on-error: true
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          pip install --upgrade pip
#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
#          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
#          pip install https://github.com/kpu/kenlm/archive/master.zip
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#      - name: Run all tests on GPU
#        run: |
#          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_flax_gpu_failures_short.txt
#
#      - name: Test durations
#        if: ${{ always() }}
#        run: cat reports/tests_flax_gpu_durations.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_flax_gpu_test_reports
#          path: reports

  # TensorFlow: full test suite and pipeline tests on a single-GPU runner.
  run_all_tests_tf_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
          pip install https://github.com/kpu/kenlm/archive/master.zip

      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_durations.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: "yes"
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_gpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_gpu_test_reports
          path: reports

  # PyTorch/XLA: example tests on a TPU runner.
  # NOTE(review): this job is not listed in `send_results.needs` below, so the
  # notification job does not wait for it - confirm that is intentional.
  run_all_examples_torch_xla_tpu:
    runs-on: [self-hosted, docker-tpu-test, tpu-v3-8]
    container:
      image: gcr.io/tpu-pytorch/xla:nightly_3.8_tpuvm
      options: --privileged -v "/lib/libtpu.so:/lib/libtpu.so" -v /mnt/cache/.cache/huggingface:/mnt/cache/ --shm-size 16G
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install .[testing]

      - name: Are TPUs recognized by our DL frameworks
        env:
          XRT_TPU_CONFIG: "localservice;0;localhost:51011"
        run: |
          python -c "import torch_xla.core.xla_model as xm; print(xm.xla_device())"

      - name: Run example tests on TPU
        env:
          XRT_TPU_CONFIG: "localservice;0;localhost:51011"
          MKL_SERVICE_FORCE_INTEL: "1"  # See: https://github.com/pytorch/pytorch/issues/37377
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_xla_tpu examples/pytorch/test_xla_examples.py

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_xla_tpu_failures_short.txt

      - name: Tests durations
        if: ${{ always() }}
        run: cat reports/tests_torch_xla_tpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_examples_torch_xla_tpu
          path: reports

  # PyTorch: full test suite and pipeline tests on a multi-GPU runner.
  run_all_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
          python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
          wandb login ${{ secrets.WANDB_API_KEY }}

      - name: Are GPUs recognized by our DL frameworks
        run: |
          utils/print_env_pt.py

      - name: Run all tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: "1"
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_durations.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: "yes"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_multi_gpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

  # TensorFlow: full test suite and pipeline tests on a multi-GPU runner.
  run_all_tests_tf_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
          pip install https://github.com/kpu/kenlm/archive/master.zip

      - name: Are GPUs recognized by our DL frameworks
        run: |
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_tf_multi_gpu_durations.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          RUN_PIPELINE_TESTS: "yes"
          TF_NUM_INTEROP_THREADS: "1"
          TF_NUM_INTRAOP_THREADS: "16"
        run: |
          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_multi_gpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

#  run_all_tests_flax_multi_gpu:
#    runs-on: [self-hosted, docker-gpu, multi-gpu]
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#      - name: Launcher docker
#        uses: actions/checkout@v2
#
#      - name: NVIDIA-SMI
#        run: |
#          nvidia-smi
#
#      - name: Install dependencies
#        run: |
#          pip install --upgrade pip
#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
#          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
#
#      - name: Are GPUs recognized by our DL frameworks
#        run: |
#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#      - name: Run all tests on GPU
#        run: |
#          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
#
#      - name: Failure short reports
#        if: ${{ always() }}
#        run: cat reports/tests_flax_gpu_failures_short.txt
#
#      - name: Test suite reports artifacts
#        if: ${{ always() }}
#        uses: actions/upload-artifact@v2
#        with:
#          name: run_all_tests_flax_gpu_test_reports
#          path: reports

  # DeepSpeed / extended tests on a single-GPU runner.
  run_all_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          utils/print_env_pt.py

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_durations.txt

      # The artifact name intentionally keeps its existing spelling
      # (`run_tests_...`, without `all`); it is left untouched because
      # consumers of the artifact may look it up by this exact name.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_gpu_test_reports
          path: reports

  # DeepSpeed / FairScale / extended tests on a multi-GPU runner.
  run_all_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # `--gpus all` matches the other multi-gpu jobs in this workflow; the
      # single-GPU jobs are the ones that pass `--gpus 0`.
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
          pip install .[testing,deepspeed,fairscale]

      - name: Are GPUs recognized by our DL frameworks
        run: |
          utils/print_env_pt.py

      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      - name: Test durations
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_durations.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
          path: reports

  # Downloads the report artifacts and runs the notification script.
  # `if: always()` makes this job run even when some of the needed jobs failed.
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs:
      - run_all_tests_torch_gpu
      - run_all_tests_tf_gpu
      - run_all_tests_torch_multi_gpu
      - run_all_tests_tf_multi_gpu
      - run_all_tests_torch_cuda_extensions_gpu
      - run_all_tests_torch_cuda_extensions_multi_gpu
    steps:
      - uses: actions/checkout@v2

      - uses: actions/download-artifact@v2

      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
        run: |
          pip install slack_sdk
          python utils/notification_service.py scheduled