"app/dialog/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "95fdd8d619ad4dc9215cdce8a8665284a96cd96f"
Unverified commit 58f672e6, authored by Lysandre Debut and committed by GitHub

Tests run on Docker (#10681)



* Tests run on Docker
Co-authored-by: Morgan <funtowiczmo@gmail.com>

* Comments from code review

* Reply to itself

* Dependencies
Co-authored-by: Morgan <funtowiczmo@gmail.com>
parent d41dd535
@@ -10,73 +10,42 @@ on:
       - "tests/**"
       - ".github/**"
       - "templates/**"
-  # pull_request:
   repository_dispatch:

 jobs:
   run_tests_torch_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Python version
-      run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-tests_torch_gpu-${{ hashFiles('setup.py') }}
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
-        sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+        apt -y update && apt install -y libsndfile1-dev
         pip install --upgrade pip
-        pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech]
-        pip install git+https://github.com/huggingface/datasets
+        pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+        python -c "import torch; print('Cuda version:', torch.version.cuda)"
+        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
         python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
-    # - name: Create model files
-    #   run: |
-    #     source .env/bin/activate
-    #     transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-    #     transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-    #     transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
-    #     transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
     - name: Run all non-slow tests on GPU
       env:
-        OMP_NUM_THREADS: 1
-        CUDA_VISIBLE_DEVICES: 0
+        OMP_NUM_THREADS: 8
+        MKL_NUM_THREADS: 8
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+        python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -89,68 +58,38 @@ jobs:
         name: run_all_tests_torch_gpu_test_reports
         path: reports

   run_tests_tf_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Python version
-      run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-tests_tf_gpu-${{ hashFiles('setup.py') }}
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
         pip install --upgrade pip
-        pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-        pip install git+https://github.com/huggingface/datasets
+        pip install .[sklearn,testing,onnxruntime,sentencepiece]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-    - name: Create model files
-      run: |
-        source .env/bin/activate
-        # transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-        # transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-        # transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
-        # transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
     - name: Run all non-slow tests on GPU
       env:
-        OMP_NUM_THREADS: 1
-        CUDA_VISIBLE_DEVICES: 0
+        OMP_NUM_THREADS: 8
+        MKL_NUM_THREADS: 8
+        TF_NUM_INTRAOP_THREADS: 8
+        TF_NUM_INTEROP_THREADS: 1
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+        python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -163,58 +102,41 @@ jobs:
         name: run_all_tests_tf_gpu_test_reports
         path: reports

   run_tests_torch_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Python version
-      run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
-        sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+        apt -y update && apt install -y libsndfile1-dev
         pip install --upgrade pip
-        pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech]
-        pip install git+https://github.com/huggingface/datasets
+        pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+        python -c "import torch; print('Cuda version:', torch.version.cuda)"
+        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
         python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
     - name: Run all non-slow tests on GPU
       env:
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 8
+        MKL_NUM_THREADS: 8
+        MKL_SERVICE_FORCE_INTEL: 1
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+        python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -228,56 +150,37 @@ jobs:
         path: reports

   run_tests_tf_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Python version
-      run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
         pip install --upgrade pip
-        pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-        pip install git+https://github.com/huggingface/datasets
+        pip install .[sklearn,testing,onnxruntime,sentencepiece]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
     - name: Run all non-slow tests on GPU
       env:
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 8
+        MKL_NUM_THREADS: 8
+        TF_NUM_INTRAOP_THREADS: 8
+        TF_NUM_INTEROP_THREADS: 1
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+        python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -289,3 +192,22 @@ jobs:
       with:
         name: run_all_tests_tf_multi_gpu_test_reports
         path: reports
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    if: always()
+    needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/download-artifact@v2
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py push
\ No newline at end of file
-# configuration notes:
-#
-# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise
-#   the step uses the system-wide python interpreter.
 name: Self-hosted runner (scheduled)

 on:
@@ -15,61 +10,39 @@ on:

 jobs:
   run_all_tests_torch_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
-    - name: Python version
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-      if: steps.cache.outputs.cache-hit != 'true'
-      run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
+        apt -y update && apt install -y libsndfile1-dev
         pip install --upgrade pip
-        pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
-        pip install git+https://github.com/huggingface/datasets
-        pip list
+        pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+        python -c "import torch; print('Cuda version:', torch.version.cuda)"
+        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
         python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
     - name: Run all tests on GPU
       env:
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 16
+        MKL_NUM_THREADS: 16
         RUN_SLOW: yes
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+        python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -78,12 +51,13 @@ jobs:
     - name: Run examples tests on GPU
       if: ${{ always() }}
       env:
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 16
+        MKL_NUM_THREADS: 16
         RUN_SLOW: yes
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
         pip install -r examples/_tests_requirements.txt
-        python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
+        python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
     - name: Failure short reports
       if: ${{ always() }}
@@ -92,13 +66,13 @@ jobs:
     - name: Run all pipeline tests on GPU
       if: ${{ always() }}
       env:
-        TF_FORCE_GPU_ALLOW_GROWTH: "true"
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 16
+        MKL_NUM_THREADS: 16
         RUN_SLOW: yes
         RUN_PIPELINE_TESTS: yes
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+        python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -111,64 +85,39 @@ jobs:
         name: run_all_tests_torch_gpu_test_reports
         path: reports

   run_all_tests_tf_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
-    - name: Python version
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-      if: steps.cache.outputs.cache-hit != 'true'
-      run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
         pip install --upgrade pip
-        pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-        pip install git+https://github.com/huggingface/datasets
-        pip list
+        pip install .[sklearn,testing,onnx,sentencepiece]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
     - name: Run all tests on GPU
       env:
-        OMP_NUM_THREADS: 1
         RUN_SLOW: yes
+        HF_HOME: /mnt/cache
+        OMP_NUM_THREADS: 16
+        TF_NUM_INTEROP_THREADS: 1
+        TF_NUM_INTRAOP_THREADS: 16
+        MKL_NUM_THREADS: 16
       run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+        python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -177,17 +126,19 @@ jobs:
     - name: Run all pipeline tests on GPU
       if: ${{ always() }}
       env:
-        TF_FORCE_GPU_ALLOW_GROWTH: "true"
-        OMP_NUM_THREADS: 1
         RUN_SLOW: yes
+        HF_HOME: /mnt/cache
+        OMP_NUM_THREADS: 16
         RUN_PIPELINE_TESTS: yes
+        TF_NUM_INTEROP_THREADS: 1
+        TF_NUM_INTRAOP_THREADS: 16
+        MKL_NUM_THREADS: 16
       run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
+        python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
-      run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
+      run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
     - name: Test suite reports artifacts
       if: ${{ always() }}
@@ -197,92 +148,55 @@ jobs:
         path: reports

   run_all_tests_torch_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
-    - name: Python version
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-      if: steps.cache.outputs.cache-hit != 'true'
-      run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
+        apt -y update && apt install -y libsndfile1-dev
         pip install --upgrade pip
-        pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
-        pip install git+https://github.com/huggingface/datasets
-        pip install fairscale
-        pip install deepspeed
-        pip list
+        pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+        python -c "import torch; print('Cuda version:', torch.version.cuda)"
+        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
         python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
-    - name: Run all tests on multi-GPU
+    - name: Run all tests on GPU
       env:
-        OMP_NUM_THREADS: 1
         RUN_SLOW: yes
+        HF_HOME: /mnt/cache
+        OMP_NUM_THREADS: 16
+        MKL_NUM_THREADS: 16
+        MKL_SERVICE_FORCE_INTEL: 1
      run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+        python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
       run: cat reports/tests_torch_multi_gpu_failures_short.txt
-    - name: Run examples tests on multi-GPU
-      if: ${{ always() }}
-      env:
-        OMP_NUM_THREADS: 1
-        RUN_SLOW: yes
-      run: |
-        source .env/bin/activate
-        pip install -r examples/_tests_requirements.txt
-        python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
-    - name: Run all pipeline tests on multi-GPU
+    - name: Run all pipeline tests on GPU
       if: ${{ always() }}
       env:
-        TF_FORCE_GPU_ALLOW_GROWTH: "true"
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 16
+        MKL_NUM_THREADS: 16
         RUN_SLOW: yes
         RUN_PIPELINE_TESTS: yes
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+        python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -296,76 +210,55 @@ jobs:
         path: reports

   run_all_tests_tf_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - uses: actions/checkout@v2
-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v1.2-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
-    - name: Python version
-      run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Kill any run-away pytest processes
-      run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-      if: steps.cache.outputs.cache-hit != 'true'
+    - name: Launcher docker
+      uses: actions/checkout@v2
+    - name: NVIDIA-SMI
       run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
+        nvidia-smi
     - name: Install dependencies
       run: |
-        source .env/bin/activate
         pip install --upgrade pip
-        pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-        pip install git+https://github.com/huggingface/datasets
-        pip list
+        pip install .[sklearn,testing,onnx,sentencepiece]
     - name: Are GPUs recognized by our DL frameworks
       run: |
-        source .env/bin/activate
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
         TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-    - name: Run all tests on multi-GPU
+    - name: Run all tests on GPU
      env:
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 16
         RUN_SLOW: yes
+        MKL_NUM_THREADS: 16
+        TF_NUM_INTEROP_THREADS: 1
+        TF_NUM_INTRAOP_THREADS: 16
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+        python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
       run: cat reports/tests_tf_multi_gpu_failures_short.txt
-    - name: Run all pipeline tests on multi-GPU
+    - name: Run all pipeline tests on GPU
       if: ${{ always() }}
       env:
-        TF_FORCE_GPU_ALLOW_GROWTH: "true"
-        OMP_NUM_THREADS: 1
+        OMP_NUM_THREADS: 16
         RUN_SLOW: yes
         RUN_PIPELINE_TESTS: yes
+        MKL_NUM_THREADS: 16
+        TF_NUM_INTEROP_THREADS: 1
+        TF_NUM_INTRAOP_THREADS: 16
+        HF_HOME: /mnt/cache
       run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+        python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
@@ -377,3 +270,23 @@ jobs:
       with:
         name: run_all_tests_tf_multi_gpu_test_reports
         path: reports
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    if: always()
+    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/download-artifact@v2
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py scheduled
@@ -115,6 +115,7 @@ _deps = [
     "psutil",
     "pydantic",
     "pytest",
+    "pytest-sugar",
     "pytest-xdist",
     "python>=3.6.0",
     "recommonmark",
@@ -225,6 +226,7 @@ else:
 extras["tokenizers"] = deps_list("tokenizers")
 extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
+extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxruntime"]
 extras["modelcreation"] = deps_list("cookiecutter")
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
@@ -232,7 +234,7 @@ extras["speech"] = deps_list("soundfile", "torchaudio")
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
-    deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets")
+    deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar")
     + extras["retrieval"]
     + extras["modelcreation"]
 )
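
The `deps_list(...)` calls above resolve package names against the pinned `deps` table built from `_deps` (the same table mirrored in the dependency version hunk that follows). A minimal sketch, under the assumption that the helper simply looks names up in that table; the actual implementation in setup.py may differ:

    import re

    # Illustrative subset only; the real _deps list is much longer.
    _deps = ["psutil", "pytest", "pytest-sugar", "pytest-xdist", "python>=3.6.0"]

    # Map the bare package name to its full (possibly pinned) requirement string.
    deps = {re.match(r"^[^<>=!~;]+", d).group(0): d for d in _deps}

    def deps_list(*pkgs):
        # Look every requested package up in the pinned table.
        return [deps[pkg] for pkg in pkgs]

    print(deps_list("pytest", "pytest-sugar", "python"))
    # ['pytest', 'pytest-sugar', 'python>=3.6.0']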
@@ -28,6 +28,7 @@ deps = {
     "psutil": "psutil",
     "pydantic": "pydantic",
     "pytest": "pytest",
+    "pytest-sugar": "pytest-sugar",
     "pytest-xdist": "pytest-xdist",
     "python": "python>=3.6.0",
     "recommonmark": "recommonmark",
@@ -137,6 +137,17 @@ def slow(test_case):
     return test_case


+def tooslow(test_case):
+    """
+    Decorator marking a test as too slow.
+
+    Slow tests are skipped while they're in the process of being fixed. No test should stay tagged as "tooslow" as
+    these will not be tested by the CI.
+    """
+    return unittest.skip("test is too slow")(test_case)
+
+
 def custom_tokenizers(test_case):
     """
     Decorator marking a test for a custom tokenizer.
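
As a quick usage illustration (the test class and test body below are invented for the example, not part of this diff), `tooslow` is applied the same way as `slow`, but it skips the test unconditionally, even when `RUN_SLOW=yes`:

    import unittest

    from transformers.testing_utils import tooslow


    class DummyTFModelTest(unittest.TestCase):
        @tooslow
        def test_saved_model_creation(self):
            # Never executed: the decorator wraps the test in unittest.skip(...).
            self.fail("unreachable")


    if __name__ == "__main__":
        unittest.main()  # reports the test as skipped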
@@ -25,7 +25,14 @@ from importlib import import_module
 from typing import List, Tuple

 from transformers import is_tf_available
-from transformers.testing_utils import _tf_gpu_memory_limit, is_pt_tf_cross_test, require_onnx, require_tf, slow
+from transformers.testing_utils import (
+    _tf_gpu_memory_limit,
+    is_pt_tf_cross_test,
+    require_onnx,
+    require_tf,
+    slow,
+    tooslow,
+)


 if is_tf_available():
@@ -129,7 +136,7 @@ class TFModelTesterMixin:
         self.assert_outputs_same(after_outputs, outputs)

-    @slow
+    @tooslow
     def test_graph_mode(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
@@ -143,7 +150,7 @@ class TFModelTesterMixin:
             outputs = run_in_graph_mode()
             self.assertIsNotNone(outputs)

-    @slow
+    @tooslow
     def test_xla_mode(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
@@ -184,7 +191,7 @@ class TFModelTesterMixin:
             expected_arg_names = ["input_ids"]
             self.assertListEqual(arg_names[:1], expected_arg_names)

-    @slow
+    @tooslow
     def test_saved_model_creation(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_hidden_states = False
@@ -205,7 +212,7 @@ class TFModelTesterMixin:
             saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
             self.assertTrue(os.path.exists(saved_model_dir))

-    @slow
+    @tooslow
     def test_saved_model_creation_extended(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_hidden_states = True
@@ -314,7 +321,7 @@ class TFModelTesterMixin:
             onnxruntime.InferenceSession(onnx_model.SerializeToString())

-    @slow
+    @tooslow
     def test_mixed_precision(self):
         tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
@@ -488,7 +495,7 @@ class TFModelTesterMixin:
         max_diff = np.amax(np.abs(tfo - pto))
         self.assertLessEqual(max_diff, 4e-2)

-    @slow
+    @tooslow
     def test_train_pipeline_custom_model(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         # head_mask and decoder_head_mask has different shapes than other input args
@@ -909,7 +916,7 @@ class TFModelTesterMixin:
             model(inputs)

-    @slow
+    @tooslow
     def test_graph_mode_with_inputs_embeds(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import sys

from slack_sdk import WebClient


def handle_test_results(test_results):
    expressions = test_results.split(" ")

    failed = 0
    success = 0

    # When the output is short enough, the output is surrounded by = signs: "== OUTPUT =="
    # When it is too long, those signs are not present.
    time_spent = expressions[-2] if "=" in expressions[-1] else expressions[-1]

    for i, expression in enumerate(expressions):
        if "failed" in expression:
            failed += int(expressions[i - 1])
        if "passed" in expression:
            success += int(expressions[i - 1])

    return failed, success, time_spent


def format_for_slack(total_results, results, scheduled: bool):
    print(results)
    header = {
        "type": "header",
        "text": {
            "type": "plain_text",
            "text": "🤗 Results of the scheduled tests, March 11, 2021." if scheduled else "🤗 Self-push results",
            "emoji": True,
        },
    }

    total = (
        {
            "type": "section",
            "fields": [
                {"type": "mrkdwn", "text": f"*Failures:*\n{total_results['failed']} failures."},
                {"type": "mrkdwn", "text": f"*Passed:*\n{total_results['success']} tests passed."},
            ],
        }
        if total_results["failed"] > 0
        else {
            "type": "section",
            "fields": [{"type": "mrkdwn", "text": f"*Congrats!*\nAll {total_results['success']} tests pass."}],
        }
    )

    blocks = [header, total]

    if total_results["failed"] > 0:
        for key, result in results.items():
            print(key, result)
            blocks.append({"type": "header", "text": {"type": "plain_text", "text": key, "emoji": True}})
            blocks.append(
                {
                    "type": "section",
                    "fields": [
                        {
                            "type": "mrkdwn",
                            "text": f"*Results:*\n{result['failed']} failed, {result['success']} passed.",
                        },
                        {"type": "mrkdwn", "text": f"*Time spent:*\n{result['time_spent']}"},
                    ],
                }
            )
    else:
        for key, result in results.items():
            blocks.append(
                {"type": "section", "fields": [{"type": "mrkdwn", "text": f"*{key}*\n{result['time_spent']}."}]}
            )

    footer = {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": "<https://github.com/huggingface/transformers/actions/workflows/self-scheduled.yml|View on GitHub>"
            if scheduled
            else "<https://github.com/huggingface/transformers/actions/workflows/self-push.yml|View on GitHub>",
        },
    }

    blocks.append(footer)

    blocks = {"blocks": blocks}

    return blocks


if __name__ == "__main__":
    scheduled = sys.argv[1] == "scheduled"

    if scheduled:
        # The scheduled run has several artifacts for each job.
        file_paths = {
            "TF Single GPU": {
                "common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt",
                "pipeline": "run_all_tests_tf_gpu_test_reports/tests_tf_pipeline_gpu_[].txt",
            },
            "Torch Single GPU": {
                "common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt",
                "pipeline": "run_all_tests_torch_gpu_test_reports/tests_torch_pipeline_gpu_[].txt",
                "examples": "run_all_tests_torch_gpu_test_reports/examples_torch_gpu_[].txt",
            },
            "TF Multi GPU": {
                "common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt",
                "pipeline": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_pipeline_multi_gpu_[].txt",
            },
            "Torch Multi GPU": {
                "common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt",
                "pipeline": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_pipeline_multi_gpu_[].txt",
            },
        }
    else:
        file_paths = {
            "TF Single GPU": {"common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt"},
            "Torch Single GPU": {"common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt"},
            "TF Multi GPU": {"common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt"},
            "Torch Multi GPU": {"common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt"},
        }

    client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
    channel_id = os.environ["CI_SLACK_CHANNEL_ID"]

    try:
        results = {}
        for job, file_dict in file_paths.items():

            # Single return value for failed/success across steps of a same job
            results[job] = {"failed": 0, "success": 0, "time_spent": "", "failures": ""}

            for key, file_path in file_dict.items():
                with open(file_path.replace("[]", "stats")) as f:
                    failed, success, time_spent = handle_test_results(f.read())
                    results[job]["failed"] += failed
                    results[job]["success"] += success
                    results[job]["time_spent"] += time_spent[1:-1] + ", "

                with open(file_path.replace("[]", "summary_short")) as f:
                    for line in f:
                        if re.search("FAILED", line):
                            results[job]["failures"] += line

            # Remove the trailing ", "
            results[job]["time_spent"] = results[job]["time_spent"][:-2]

        test_results_keys = ["failed", "success"]
        total = {"failed": 0, "success": 0}
        for job, job_result in results.items():
            for result_key in test_results_keys:
                total[result_key] += job_result[result_key]

        to_be_sent_to_slack = format_for_slack(total, results, scheduled)

        result = client.chat_postMessage(
            channel=channel_id,
            blocks=to_be_sent_to_slack["blocks"],
        )

        for job, job_result in results.items():
            if len(job_result["failures"]):
                client.chat_postMessage(
                    channel=channel_id, text=f"{job}\n{job_result['failures']}", thread_ts=result["ts"]
                )

    except Exception as e:
        # Voluntarily catch every exception and send it to Slack.
        raise Exception(f"Setup error: no artifacts were found. Error: {e}") from e