name: Self-hosted runner (push)

on:
  push:
    branches:
      - main
      - ci_*
      - ci-*
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
      - "utils/**"
  repository_dispatch:

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60

jobs:
  run_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Install dependencies
      run: |
        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
        apt install -y libsndfile1-dev espeak-ng
        pip install --upgrade pip
        pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
        pip install https://github.com/kpu/kenlm/archive/master.zip

    - name: Launcher docker
      uses: actions/checkout@v2
      with:
        fetch-depth: 2

    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    - name: Are GPUs recognized by our DL frameworks
      run: |
        utils/print_env_pt.py

    - name: Fetch the tests to run
      run: |
        python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt

    - name: Report fetched tests
      uses: actions/upload-artifact@v2
      with:
        name: test_fetched
        path: test_preparation.txt

    - name: Run all non-slow tests on GPU
      run: |
        if [ -f test_list.txt ]; then
          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu $(cat test_list.txt)
        fi

    - name: Failure short reports
      if: ${{ failure() }}
      run: cat reports/tests_torch_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_all_tests_torch_gpu_test_reports
        path: reports
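# Note: utils/tests_fetcher.py prints a human-readable summary to stdout (captured
# into test_preparation.txt by `tee` above) and writes the actual list of impacted
# test files to test_list.txt, its default output file. That is why every test step
# in this workflow guards on test_list.txt: if the push touched nothing testable,
# the file is absent and pytest is skipped entirely.
# (The Flax and TF flavors of these jobs are kept below, commented out.)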
#  run_tests_flax_gpu:
#    runs-on: [self-hosted, docker-gpu-test, single-gpu]
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#    - name: Set up Python 3.7
#      uses: actions/setup-python@v2
#      with:
#        python-version: 3.7
#
#    - name: Install dependencies
#      run: |
#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
#        pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
#        pip install --upgrade pip
#        pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
#
#    - name: Launcher docker
#      uses: actions/checkout@v2
#      with:
#        fetch-depth: 2
#
#    - name: NVIDIA-SMI
#      continue-on-error: true
#      run: |
#        nvidia-smi
#
#    - name: Are GPUs recognized by our DL frameworks
#      run: |
#        python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
#        python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#    - name: Fetch the tests to run
#      run: |
#        python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
#
#    - name: Report fetched tests
#      uses: actions/upload-artifact@v2
#      with:
#        name: test_fetched
#        path: test_preparation.txt
#
#    - name: Run all non-slow tests on GPU
#      run: |
#        if [ -f test_list.txt ]; then
#          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
#        fi
#
#    - name: Failure short reports
#      if: ${{ failure() }}
#      run: cat reports/tests_flax_gpu_failures_short.txt
#
#    - name: Test suite reports artifacts
#      if: ${{ always() }}
#      uses: actions/upload-artifact@v2
#      with:
#        name: run_all_tests_flax_gpu_test_reports
#        path: reports

#  run_tests_tf_gpu:
#    runs-on: [self-hosted, docker-gpu, single-gpu]
#    timeout-minutes: 120
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#    - name: Install dependencies
#      run: |
#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
#        pip install --upgrade pip
#        pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
#        pip install https://github.com/kpu/kenlm/archive/master.zip
#
#    - name: Launcher docker
#      uses: actions/checkout@v2
#      with:
#        fetch-depth: 2
#
#    - name: NVIDIA-SMI
#      run: |
#        nvidia-smi
#
#    - name: Are GPUs recognized by our DL frameworks
#      run: |
#        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
#        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
#
#    - name: Fetch the tests to run
#      run: |
#        python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
#
#    - name: Report fetched tests
#      uses: actions/upload-artifact@v2
#      with:
#        name: test_fetched
#        path: test_preparation.txt
#
#    - name: Run all non-slow tests on GPU
#      env:
#        TF_NUM_INTRAOP_THREADS: 8
#        TF_NUM_INTEROP_THREADS: 1
#      run: |
#        if [ -f test_list.txt ]; then
#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu $(cat test_list.txt)
#        fi
#
#    - name: Failure short reports
#      if: ${{ failure() }}
#      run: cat reports/tests_tf_gpu_failures_short.txt
#
#    - name: Test suite reports artifacts
#      if: ${{ always() }}
#      uses: actions/upload-artifact@v2
#      with:
#        name: run_all_tests_tf_gpu_test_reports
#        path: reports

  run_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Install dependencies
      run: |
        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
        apt install -y libsndfile1-dev espeak-ng
        pip install --upgrade pip
        pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
        pip install https://github.com/kpu/kenlm/archive/master.zip

    - name: Launcher docker
      uses: actions/checkout@v2
      with:
        fetch-depth: 2

    - name: NVIDIA-SMI
      continue-on-error: true
      run: |
        nvidia-smi

    - name: Are GPUs recognized by our DL frameworks
      run: |
        utils/print_env_pt.py

    - name: Fetch the tests to run
      run: |
        python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt

    - name: Report fetched tests
      uses: actions/upload-artifact@v2
      with:
        name: test_fetched
        path: test_preparation.txt

    - name: Run all non-slow tests on GPU
      env:
        MKL_SERVICE_FORCE_INTEL: 1
      run: |
        if [ -f test_list.txt ]; then
          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu $(cat test_list.txt)
        fi

    - name: Failure short reports
      if: ${{ failure() }}
      run: cat reports/tests_torch_multi_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_all_tests_torch_multi_gpu_test_reports
        path: reports
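# Note: --make-reports is not a stock pytest option; it is defined in this repo's
# conftest.py and makes the run write a set of report files under reports/,
# including the *_failures_short.txt files that the "Failure short reports" steps
# print on failure. `-n 2 --dist=loadfile` comes from pytest-xdist: two worker
# processes, with all tests from a given file routed to the same worker.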
#  run_tests_flax_multi_gpu:
#    runs-on: [self-hosted, docker-gpu, multi-gpu]
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#    - name: Install dependencies
#      run: |
#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
#        pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
#        pip install --upgrade pip
#        pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
#        pip install https://github.com/kpu/kenlm/archive/master.zip
#
#    - name: Launcher docker
#      uses: actions/checkout@v2
#      with:
#        fetch-depth: 2
#
#    - name: NVIDIA-SMI
#      continue-on-error: true
#      run: |
#        nvidia-smi
#
#    - name: Are GPUs recognized by our DL frameworks
#      run: |
#        python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
#        python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#    - name: Fetch the tests to run
#      run: |
#        python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
#
#    - name: Report fetched tests
#      uses: actions/upload-artifact@v2
#      with:
#        name: test_fetched
#        path: test_preparation.txt
#
#    - name: Run all non-slow tests on GPU
#      run: |
#        if [ -f test_list.txt ]; then
#          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
#        fi
#
#    - name: Failure short reports
#      if: ${{ failure() }}
#      run: cat reports/tests_flax_multi_gpu_failures_short.txt
#
#    - name: Test suite reports artifacts
#      if: ${{ always() }}
#      uses: actions/upload-artifact@v2
#      with:
#        name: run_all_tests_flax_multi_gpu_test_reports
#        path: reports

#  run_tests_tf_multi_gpu:
#    runs-on: [self-hosted, docker-gpu, multi-gpu]
#    timeout-minutes: 120
#    container:
#      image: tensorflow/tensorflow:2.4.1-gpu
#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
#    steps:
#    - name: Install dependencies
#      run: |
#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
#        pip install --upgrade pip
#        pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
#        pip install https://github.com/kpu/kenlm/archive/master.zip
#
#    - name: Launcher docker
#      uses: actions/checkout@v2
#      with:
#        fetch-depth: 2
#
#    - name: NVIDIA-SMI
#      run: |
#        nvidia-smi
#
#    - name: Are GPUs recognized by our DL frameworks
#      run: |
#        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
#        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
#
#    - name: Fetch the tests to run
#      run: |
#        python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
#
#    - name: Report fetched tests
#      uses: actions/upload-artifact@v2
#      with:
#        name: test_fetched
#        path: test_preparation.txt
#
#    - name: Run all non-slow tests on GPU
#      env:
#        TF_NUM_INTRAOP_THREADS: 8
#        TF_NUM_INTEROP_THREADS: 1
#      run: |
#        if [ -f test_list.txt ]; then
#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu $(cat test_list.txt)
#        fi
#
#    - name: Failure short reports
#      if: ${{ failure() }}
#      run: cat reports/tests_tf_multi_gpu_failures_short.txt
#
#    - name: Test suite reports artifacts
#      if: ${{ always() }}
#      uses: actions/upload-artifact@v2
#      with:
#        name: run_all_tests_tf_multi_gpu_test_reports
#        path: reports
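# The two jobs below exercise DeepSpeed/fairscale, which JIT-compile CUDA
# extensions, so they run in NVIDIA's NGC PyTorch image: unlike the -runtime
# image used above, it ships the full CUDA toolchain (nvcc) needed for those
# builds. libaio-dev is required by DeepSpeed's async I/O ops, and the multi-GPU
# job clears ~/.cache/torch_extensions/ so a JIT build from one configuration is
# not reused by a conflicting one.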
  run_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Launcher docker
      uses: actions/checkout@v2
      with:
        fetch-depth: 2

    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
        apt -y update && apt install -y libaio-dev
        pip install --upgrade pip
        pip install .[deepspeed-testing]

    - name: Are GPUs recognized by our DL frameworks
      run: |
        utils/print_env_pt.py

    - name: Fetch the tests to run
      run: |
        python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt

    - name: Report fetched tests
      uses: actions/upload-artifact@v2
      with:
        name: test_fetched
        path: test_preparation.txt

    - name: Run all tests on GPU
      run: |
        if [ -f test_list.txt ]; then
          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu $(cat test_list.txt)
        fi

    - name: Failure short reports
      if: ${{ failure() }}
      run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_tests_torch_cuda_extensions_gpu_test_reports
        path: reports

  run_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # multi-GPU job: expose all GPUs, matching run_tests_torch_multi_gpu above
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
    - name: Launcher docker
      uses: actions/checkout@v2
      with:
        fetch-depth: 2

    - name: NVIDIA-SMI
      continue-on-error: true
      run: |
        nvidia-smi

    - name: Install dependencies
      run: |
        apt -y update && apt install -y libaio-dev
        pip install --upgrade pip
        rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
        pip install .[testing,deepspeed,fairscale]

    - name: Are GPUs recognized by our DL frameworks
      run: |
        utils/print_env_pt.py

    - name: Fetch the tests to run
      run: |
        python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt

    - name: Report fetched tests
      uses: actions/upload-artifact@v2
      with:
        name: test_fetched
        path: test_preparation.txt

    - name: Run all tests on GPU
      run: |
        if [ -f test_list.txt ]; then
          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu $(cat test_list.txt)
        fi

    - name: Failure short reports
      if: ${{ failure() }}
      run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
        path: reports

  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [
        run_tests_torch_gpu,
#        run_tests_tf_gpu,
        run_tests_torch_multi_gpu,
#        run_tests_tf_multi_gpu,
        run_tests_torch_cuda_extensions_gpu,
        run_tests_torch_cuda_extensions_multi_gpu
    ]
    steps:
    - uses: actions/checkout@v2
    - uses: actions/download-artifact@v2
    - name: Send message to Slack
      env:
        CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
        CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
      run: |
        pip install slack_sdk
        python utils/notification_service_deprecated.py push
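# send_results runs with `if: always()` so a Slack message goes out even when the
# test jobs fail; actions/download-artifact@v2 with no `name` fetches every report
# artifact uploaded above for the notification script to aggregate. The trailing
# "push" argument presumably tells the (deprecated) notification script which CI
# flavor it is reporting on.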