Unverified commit 58f672e6, authored by Lysandre Debut and committed by GitHub

Tests run on Docker (#10681)



* Tests run on Docker
Co-authored-by: Morgan <funtowiczmo@gmail.com>

* Comments from code review

* Reply to itself

* Dependencies
Co-authored-by: Morgan <funtowiczmo@gmail.com>
parent d41dd535
@@ -10,73 +10,42 @@ on:
- "tests/**"
- ".github/**"
- "templates/**"
# pull_request:
repository_dispatch:
jobs:
run_tests_torch_gpu:
runs-on: [self-hosted, gpu, single-gpu]
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Launcher docker
uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
- name: NVIDIA-SMI
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
apt -y update && apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech]
pip install git+https://github.com/huggingface/datasets
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
# - name: Create model files
# run: |
# source .env/bin/activate
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests
python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -89,68 +58,38 @@ jobs:
name: run_all_tests_torch_gpu_test_reports
path: reports
run_tests_tf_gpu:
runs-on: [self-hosted, gpu, single-gpu]
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Launcher docker
uses: actions/checkout@v2
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
- name: NVIDIA-SMI
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip install .[sklearn,testing,onnxruntime,sentencepiece]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Create model files
run: |
source .env/bin/activate
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
TF_NUM_INTRAOP_THREADS: 8
TF_NUM_INTEROP_THREADS: 1
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests
python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -163,58 +102,41 @@ jobs:
name: run_all_tests_tf_gpu_test_reports
path: reports
run_tests_torch_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Launcher docker
uses: actions/checkout@v2
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
- name: NVIDIA-SMI
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
apt -y update && apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech]
pip install git+https://github.com/huggingface/datasets
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
MKL_SERVICE_FORCE_INTEL: 1
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -228,56 +150,37 @@ jobs:
path: reports
run_tests_tf_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Launcher docker
uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
- name: NVIDIA-SMI
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip install .[sklearn,testing,onnxruntime,sentencepiece]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
TF_NUM_INTRAOP_THREADS: 8
TF_NUM_INTEROP_THREADS: 1
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -289,3 +192,22 @@ jobs:
with:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports
send_results:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
run: |
pip install slack_sdk
python utils/notification_service.py push
\ No newline at end of file
# configuration notes:
#
# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise
# the step uses the system-wide python interpreter.
name: Self-hosted runner (scheduled)
on:
@@ -15,61 +10,39 @@ on:
jobs:
run_all_tests_torch_gpu:
runs-on: [self-hosted, gpu, single-gpu]
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Launcher docker
uses: actions/checkout@v2
- name: Python version
- name: NVIDIA-SMI
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
apt -y update && apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
RUN_SLOW: yes
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -78,12 +51,13 @@ jobs:
- name: Run examples tests on GPU
if: ${{ always() }}
env:
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
RUN_SLOW: yes
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
pip install -r examples/_tests_requirements.txt
python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
- name: Failure short reports
if: ${{ always() }}
@@ -92,13 +66,13 @@ jobs:
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -111,64 +85,39 @@ jobs:
name: run_all_tests_torch_gpu_test_reports
path: reports
run_all_tests_tf_gpu:
runs-on: [self-hosted, gpu, single-gpu]
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Launcher docker
uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Python version
- name: NVIDIA-SMI
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
pip install .[sklearn,testing,onnx,sentencepiece]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
HF_HOME: /mnt/cache
OMP_NUM_THREADS: 16
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
MKL_NUM_THREADS: 16
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -177,17 +126,19 @@ jobs:
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
HF_HOME: /mnt/cache
OMP_NUM_THREADS: 16
RUN_PIPELINE_TESTS: yes
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
MKL_NUM_THREADS: 16
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
@@ -197,92 +148,55 @@ jobs:
path: reports
run_all_tests_torch_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
- name: Launcher docker
uses: actions/checkout@v2
- name: Python version
- name: NVIDIA-SMI
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
apt -y update && apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip install fairscale
pip install deepspeed
pip list
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on multi-GPU
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
HF_HOME: /mnt/cache
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
MKL_SERVICE_FORCE_INTEL: 1
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_multi_gpu_failures_short.txt
- name: Run examples tests on multi-GPU
if: ${{ always() }}
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
pip install -r examples/_tests_requirements.txt
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
- name: Run all pipeline tests on multi-GPU
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -296,76 +210,55 @@ jobs:
path: reports
run_all_tests_tf_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.2-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Launcher docker
uses: actions/checkout@v2
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Kill any run-away pytest processes
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
- name: NVIDIA-SMI
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
nvidia-smi
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
pip install .[sklearn,testing,onnx,sentencepiece]
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on multi-GPU
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 16
RUN_SLOW: yes
MKL_NUM_THREADS: 16
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_multi_gpu_failures_short.txt
- name: Run all pipeline tests on multi-GPU
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
OMP_NUM_THREADS: 16
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
MKL_NUM_THREADS: 16
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
HF_HOME: /mnt/cache
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -377,3 +270,23 @@ jobs:
with:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports
send_results:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
run: |
pip install slack_sdk
python utils/notification_service.py scheduled
@@ -115,6 +115,7 @@ _deps = [
"psutil",
"pydantic",
"pytest",
"pytest-sugar",
"pytest-xdist",
"python>=3.6.0",
"recommonmark",
@@ -225,6 +226,7 @@ else:
extras["tokenizers"] = deps_list("tokenizers")
extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxruntime"]
extras["modelcreation"] = deps_list("cookiecutter")
extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
@@ -232,7 +234,7 @@ extras["speech"] = deps_list("soundfile", "torchaudio")
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
extras["testing"] = (
deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets")
deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar")
+ extras["retrieval"]
+ extras["modelcreation"]
)
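For context on the `setup.py` hunk above: `deps_list` resolves each requested package against the pinned-versions table (`deps`, mirrored in `dependency_versions_table.py` below). A minimal sketch of that pattern, with a trimmed-down table and no claim to match the real file line for line:

```python
# Simplified illustration of how an extras group such as "testing" is built
# from the pinned dependency table; the table here is hypothetical and trimmed.
deps = {
    "pytest": "pytest",
    "pytest-sugar": "pytest-sugar",
    "pytest-xdist": "pytest-xdist",
    "datasets": "datasets",
}


def deps_list(*pkgs):
    # Look each package up in the table so every extra reuses the same pins.
    return [deps[pkg] for pkg in pkgs]


extras = {"testing": deps_list("pytest", "pytest-xdist", "datasets", "pytest-sugar")}
print(extras["testing"])  # ['pytest', 'pytest-xdist', 'datasets', 'pytest-sugar']
```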
@@ -28,6 +28,7 @@ deps = {
"psutil": "psutil",
"pydantic": "pydantic",
"pytest": "pytest",
"pytest-sugar": "pytest-sugar",
"pytest-xdist": "pytest-xdist",
"python": "python>=3.6.0",
"recommonmark": "recommonmark",
@@ -137,6 +137,17 @@ def slow(test_case):
return test_case
def tooslow(test_case):
"""
Decorator marking a test as too slow.
Slow tests are skipped while they're in the process of being fixed. No test should stay tagged as "tooslow" as
these will not be tested by the CI.
"""
return unittest.skip("test is too slow")(test_case)
def custom_tokenizers(test_case):
"""
Decorator marking a test for a custom tokenizer.
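Usage note for the `tooslow` decorator added above (the test class and method names below are illustrative, not part of the diff): because it wraps `unittest.skip`, a decorated test is reported as skipped and its body never runs, on CI or locally.

```python
# Hypothetical example of applying the tooslow decorator defined in the diff above.
import unittest

from transformers.testing_utils import tooslow


class ExampleModelTest(unittest.TestCase):
    @tooslow
    def test_saved_model_creation_extended(self):
        # Skipped with reason "test is too slow"; this body is never executed.
        self.fail("never executed")


if __name__ == "__main__":
    unittest.main()
```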
@@ -25,7 +25,14 @@ from importlib import import_module
from typing import List, Tuple
from transformers import is_tf_available
from transformers.testing_utils import _tf_gpu_memory_limit, is_pt_tf_cross_test, require_onnx, require_tf, slow
from transformers.testing_utils import (
_tf_gpu_memory_limit,
is_pt_tf_cross_test,
require_onnx,
require_tf,
slow,
tooslow,
)
if is_tf_available():
@@ -129,7 +136,7 @@ class TFModelTesterMixin:
self.assert_outputs_same(after_outputs, outputs)
@slow
@tooslow
def test_graph_mode(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
@@ -143,7 +150,7 @@ class TFModelTesterMixin:
outputs = run_in_graph_mode()
self.assertIsNotNone(outputs)
@slow
@tooslow
def test_xla_mode(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
@@ -184,7 +191,7 @@ class TFModelTesterMixin:
expected_arg_names = ["input_ids"]
self.assertListEqual(arg_names[:1], expected_arg_names)
@slow
@tooslow
def test_saved_model_creation(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = False
@@ -205,7 +212,7 @@ class TFModelTesterMixin:
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
self.assertTrue(os.path.exists(saved_model_dir))
@slow
@tooslow
def test_saved_model_creation_extended(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True
@@ -314,7 +321,7 @@ class TFModelTesterMixin:
onnxruntime.InferenceSession(onnx_model.SerializeToString())
@slow
@tooslow
def test_mixed_precision(self):
tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
@@ -488,7 +495,7 @@ class TFModelTesterMixin:
max_diff = np.amax(np.abs(tfo - pto))
self.assertLessEqual(max_diff, 4e-2)
@slow
@tooslow
def test_train_pipeline_custom_model(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# head_mask and decoder_head_mask has different shapes than other input args
@@ -909,7 +916,7 @@ class TFModelTesterMixin:
model(inputs)
@slow
@tooslow
def test_graph_mode_with_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import sys
from slack_sdk import WebClient
def handle_test_results(test_results):
expressions = test_results.split(" ")
failed = 0
success = 0
# When the output is short enough, the output is surrounded by = signs: "== OUTPUT =="
# When it is too long, those signs are not present.
time_spent = expressions[-2] if "=" in expressions[-1] else expressions[-1]
for i, expression in enumerate(expressions):
if "failed" in expression:
failed += int(expressions[i - 1])
if "passed" in expression:
success += int(expressions[i - 1])
return failed, success, time_spent
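# Illustrative example (hypothetical stats line, not from a real run):
#   handle_test_results("== 2 failed, 120 passed in 123.45s (0:02:03) ==")
# splits on spaces, takes the number just before each "failed"/"passed" token,
# and, because the line ends with "==", takes the second-to-last token as the
# time spent, returning (2, 120, "(0:02:03)").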
def format_for_slack(total_results, results, scheduled: bool):
print(results)
header = {
"type": "header",
"text": {
"type": "plain_text",
"text": "🤗 Results of the scheduled tests, March 11, 2021." if scheduled else "🤗 Self-push results",
"emoji": True,
},
}
total = (
{
"type": "section",
"fields": [
{"type": "mrkdwn", "text": f"*Failures:*\n{total_results['failed']} failures."},
{"type": "mrkdwn", "text": f"*Passed:*\n{total_results['success']} tests passed."},
],
}
if total_results["failed"] > 0
else {
"type": "section",
"fields": [{"type": "mrkdwn", "text": f"*Congrats!*\nAll {total_results['success']} tests pass."}],
}
)
blocks = [header, total]
if total_results["failed"] > 0:
for key, result in results.items():
print(key, result)
blocks.append({"type": "header", "text": {"type": "plain_text", "text": key, "emoji": True}})
blocks.append(
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": f"*Results:*\n{result['failed']} failed, {result['success']} passed.",
},
{"type": "mrkdwn", "text": f"*Time spent:*\n{result['time_spent']}"},
],
}
)
else:
for key, result in results.items():
blocks.append(
{"type": "section", "fields": [{"type": "mrkdwn", "text": f"*{key}*\n{result['time_spent']}."}]}
)
footer = {
"type": "section",
"text": {
"type": "mrkdwn",
"text": "<https://github.com/huggingface/transformers/actions/workflows/self-scheduled.yml|View on GitHub>"
if scheduled
else "<https://github.com/huggingface/transformers/actions/workflows/self-push.yml|View on GitHub>",
},
}
blocks.append(footer)
blocks = {"blocks": blocks}
return blocks
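# Illustrative example (hypothetical numbers): with
#   total = {"failed": 0, "success": 250}
#   results = {"Torch Single GPU": {"failed": 0, "success": 250, "time_spent": "0:12:34"}}
# format_for_slack(total, results, scheduled=True) yields a header block, a
# "Congrats! All 250 tests pass." section, one short section per job with its
# time spent, and a footer linking to the scheduled workflow runs.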
if __name__ == "__main__":
scheduled = sys.argv[1] == "scheduled"
if scheduled:
# The scheduled run has several artifacts for each job.
file_paths = {
"TF Single GPU": {
"common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt",
"pipeline": "run_all_tests_tf_gpu_test_reports/tests_tf_pipeline_gpu_[].txt",
},
"Torch Single GPU": {
"common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt",
"pipeline": "run_all_tests_torch_gpu_test_reports/tests_torch_pipeline_gpu_[].txt",
"examples": "run_all_tests_torch_gpu_test_reports/examples_torch_gpu_[].txt",
},
"TF Multi GPU": {
"common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt",
"pipeline": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_pipeline_multi_gpu_[].txt",
},
"Torch Multi GPU": {
"common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt",
"pipeline": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_pipeline_multi_gpu_[].txt",
},
}
else:
file_paths = {
"TF Single GPU": {"common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt"},
"Torch Single GPU": {"common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt"},
"TF Multi GPU": {"common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt"},
"Torch Multi GPU": {"common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt"},
}
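# The "[]" placeholder is expanded below into the two report files read for
# each step: replace("[]", "stats") -> e.g. tests_tf_gpu_stats.txt (parsed by
# handle_test_results for pass/fail counts and time spent) and
# replace("[]", "summary_short") -> tests_tf_gpu_summary_short.txt (scanned
# for FAILED lines). The directory names match the report artifacts uploaded
# by the workflows above.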
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
channel_id = os.environ["CI_SLACK_CHANNEL_ID"]
try:
results = {}
for job, file_dict in file_paths.items():
# Single return value for failed/success across the steps of the same job
results[job] = {"failed": 0, "success": 0, "time_spent": "", "failures": ""}
for key, file_path in file_dict.items():
with open(file_path.replace("[]", "stats")) as f:
failed, success, time_spent = handle_test_results(f.read())
results[job]["failed"] += failed
results[job]["success"] += success
results[job]["time_spent"] += time_spent[1:-1] + ", "
with open(file_path.replace("[]", "summary_short")) as f:
for line in f:
if re.search("FAILED", line):
results[job]["failures"] += line
# Remove the trailing ", "
results[job]["time_spent"] = results[job]["time_spent"][:-2]
test_results_keys = ["failed", "success"]
total = {"failed": 0, "success": 0}
for job, job_result in results.items():
for result_key in test_results_keys:
total[result_key] += job_result[result_key]
to_be_sent_to_slack = format_for_slack(total, results, scheduled)
result = client.chat_postMessage(
channel=channel_id,
blocks=to_be_sent_to_slack["blocks"],
)
for job, job_result in results.items():
if len(job_result["failures"]):
client.chat_postMessage(
channel=channel_id, text=f"{job}\n{job_result['failures']}", thread_ts=result["ts"]
)
except Exception as e:
# Voluntarily catch every exception and send it to Slack.
raise Exception(f"Setup error: no artifacts were found. Error: {e}") from e
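# Invoked by the send_results jobs above from the repository root, after
# actions/download-artifact has placed the report directories alongside it:
#   CI_SLACK_BOT_TOKEN=... CI_SLACK_CHANNEL_ID=... \
#     python utils/notification_service.py scheduled   # or "push" for the push workflow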