Unverified Commit ba7f2173 authored by Yih-Dar, committed by GitHub

Add runner availability check (#19054)


Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent ca485e56
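
In short: each self-hosted CI workflow gains a standalone `check_runner_status` job that fails fast when a required self-hosted runner is offline, the former `run_check_runners` job (an `nvidia-smi` smoke test inside the CI image) becomes `check_runners` and moves ahead of `setup`, and `notification_service.py` learns to report "runner not available" separately from "runner environment broken" and "setup failed". The job chain in each workflow becomes:

check_runner_status -> check_runners -> setup -> run_tests_* -> send_results (if: always())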
name: Self-hosted runner (check runner status)

# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`

on:
  repository_dispatch:
  schedule:
    # run once per hour
    - cron: "0 */1 * * *"

env:
  TRANSFORMERS_IS_CI: yes

jobs:
  check_runner_status:
    name: Check Runner Status
    runs-on: ubuntu-latest
    steps:
      - name: Checkout transformers
        uses: actions/checkout@v2
        with:
          fetch-depth: 2

      - name: Check Runner Status
        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    needs: check_runner_status
    # Only notify when the runner status check itself fails.
    if: ${{ failure() }}
    steps:
      - name: Preliminary job status
        shell: bash
        run: |
          echo "Runner availability: ${{ needs.check_runner_status.result }}"

      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_EVENT: runner status check
          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
        run: |
          pip install slack_sdk
          python utils/notification_service.py
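
For context, `check_runner_status` delegates to `utils/check_self_hosted_runner.py` (added later in this commit), which reads the repository's self-hosted runner registry. A minimal sketch of the payload it consumes, following the public `GET /repos/{owner}/{repo}/actions/runners` endpoint (the values below are invented for illustration):

# Illustrative payload from GET /repos/huggingface/transformers/actions/runners
# (field names follow the public GitHub REST API; the values are invented).
payload = {
    "total_count": 2,
    "runners": [
        {"id": 1, "name": "single-gpu-ci-runner-docker", "os": "linux", "status": "online", "busy": False},
        {"id": 2, "name": "multi-gpu-ci-runner-docker", "os": "linux", "status": "offline", "busy": False},
    ],
}

# The check script raises as soon as a targeted runner reports "offline";
# the failed job is what lets `send_results` (gated on `if: failure()`) fire.
for runner in payload["runners"]:
    if runner["status"] == "offline":
        raise ValueError(f"{runner['name']} is offline!")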
@@ -23,8 +23,21 @@ env:
   RUN_PT_TF_CROSS_TESTS: 1
 
 jobs:
-  run_check_runners:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
     name: Check Runners
+    needs: check_runner_status
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
@@ -39,7 +52,7 @@ jobs:
   setup:
     name: Setup
-    needs: run_check_runners
+    needs: check_runners
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
@@ -83,7 +96,7 @@ jobs:
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
@@ -136,7 +149,7 @@ jobs:
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
@@ -185,7 +198,7 @@ jobs:
       matrix:
         machine_type: [single-gpu, multi-gpu]
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
-    needs: [run_check_runners, setup]
+    needs: setup
     container:
       image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -236,13 +249,21 @@ jobs:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
+    needs: [
+      check_runner_status,
+      check_runners,
+      setup,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_all_tests_torch_cuda_extensions_gpu
+    ]
     steps:
       - name: Preliminary job status
         shell: bash
         # For the meaning of these environment variables, see the job `Setup`
         run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
           echo "Setup status: ${{ needs.setup.result }}"
 
       - uses: actions/checkout@v2
@@ -255,8 +276,9 @@ jobs:
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
           CI_EVENT: nightly-build
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
           SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
......
@@ -27,9 +27,43 @@ env:
   RUN_PT_TF_CROSS_TESTS: 1
 
 jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
   setup:
     name: Setup
-    runs-on: ubuntu-latest
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -50,21 +84,6 @@ jobs:
           cd tests
           echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
 
-  run_check_runners:
-    name: Check Runners
-    needs: setup
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
-    container:
-      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
   run_tests_single_gpu:
     name: Model tests
     strategy:
@@ -76,7 +95,7 @@ jobs:
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [setup, run_check_runners]
+    needs: setup
     steps:
      - name: Update clone
        working-directory: /transformers
@@ -129,7 +148,7 @@ jobs:
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [setup, run_check_runners]
+    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -175,13 +194,14 @@ jobs:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
+    needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
     steps:
       - name: Preliminary job status
         shell: bash
         # For the meaning of these environment variables, see the job `Setup`
         run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
           echo "Setup status: ${{ needs.setup.result }}"
 
       - uses: actions/checkout@v2
@@ -199,8 +219,9 @@ jobs:
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
           CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
           SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
......
@@ -27,9 +27,43 @@ env:
   RUN_PT_TF_CROSS_TESTS: 1
 
 jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
   setup:
     name: Setup
-    runs-on: ubuntu-latest
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
       test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -111,24 +145,9 @@ jobs:
           echo "::set-output name=matrix::$keys"
           echo "::set-output name=test_map::$test_map"
 
-  run_check_runners:
-    name: Check Runners
-    needs: setup
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
   run_tests_single_gpu:
     name: Model tests
-    needs: [setup, run_check_runners]
+    needs: setup
     # `dummy` means there is no test to run
     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
     strategy:
@@ -213,7 +232,7 @@ jobs:
   run_tests_multi_gpu:
     name: Model tests
-    needs: [setup, run_check_runners]
+    needs: setup
     # `dummy` means there is no test to run
     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
     strategy:
@@ -300,7 +319,7 @@ jobs:
   run_tests_torch_cuda_extensions_single_gpu:
     name: Torch CUDA extension tests
-    needs: [setup, run_check_runners]
+    needs: setup
     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
     strategy:
       fail-fast: false
@@ -382,7 +401,7 @@ jobs:
   run_tests_torch_cuda_extensions_multi_gpu:
     name: Torch CUDA extension tests
-    needs: [setup, run_check_runners]
+    needs: setup
     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
     strategy:
       fail-fast: false
@@ -467,8 +486,9 @@ jobs:
     runs-on: ubuntu-latest
     if: always()
     needs: [
+      check_runner_status,
+      check_runners,
       setup,
-      run_check_runners,
       run_tests_single_gpu,
       run_tests_multi_gpu,
       run_tests_torch_cuda_extensions_single_gpu,
@@ -479,8 +499,9 @@ jobs:
         shell: bash
         # For the meaning of these environment variables, see the job `Setup`
         run: |
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
          echo "Setup status: ${{ needs.setup.result }}"
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
 
       # Necessary to get the correct branch name and commit SHA for `workflow_run` event
       # We also take into account the `push` event (we might want to test some changes in a branch)
@@ -527,8 +548,9 @@ jobs:
           CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
           CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
           CI_SHA: ${{ env.CI_SHA }}
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
......
@@ -22,8 +22,21 @@ env:
   RUN_PT_TF_CROSS_TESTS: 1
 
 jobs:
-  run_check_runners:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
     name: Check Runners
+    needs: check_runner_status
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
@@ -38,7 +51,7 @@ jobs:
   setup:
     name: Setup
-    needs: run_check_runners
+    needs: check_runners
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
@@ -82,7 +95,7 @@ jobs:
     container:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
@@ -135,7 +148,7 @@ jobs:
     container:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
@@ -183,7 +196,7 @@ jobs:
     container:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -226,7 +239,7 @@ jobs:
     container:
       image: huggingface/transformers-pytorch-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -270,7 +283,7 @@ jobs:
     container:
       image: huggingface/transformers-tensorflow-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -312,7 +325,7 @@ jobs:
       matrix:
         machine_type: [single-gpu, multi-gpu]
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
-    needs: [run_check_runners, setup]
+    needs: setup
     container:
       image: huggingface/transformers-pytorch-deepspeed-latest-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -362,7 +375,8 @@ jobs:
     runs-on: ubuntu-latest
     if: always()
     needs: [
-      run_check_runners,
+      check_runner_status,
+      check_runners,
       setup,
       run_tests_single_gpu,
       run_tests_multi_gpu,
@@ -376,7 +390,8 @@ jobs:
         shell: bash
         # For the meaning of these environment variables, see the job `Setup`
         run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
           echo "Setup status: ${{ needs.setup.result }}"
 
       - uses: actions/checkout@v2
@@ -389,8 +404,9 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_EVENT: scheduled
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
......
import argparse
import json
import subprocess


def get_runner_status(target_runners, token):
    # Query the repository's self-hosted runner registry via the GitHub REST API.
    cmd = (
        f'curl -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}"'
        " https://api.github.com/repos/huggingface/transformers/actions/runners"
    )
    output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
    o = output.stdout.decode("utf-8")
    status = json.loads(o)
    runners = status["runners"]

    # Fail loudly if any of the targeted runners is offline.
    for runner in runners:
        if runner["name"] in target_runners:
            if runner["status"] == "offline":
                raise ValueError(f"{runner['name']} is offline!")


if __name__ == "__main__":

    def list_str(values):
        return values.split(",")

    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--target_runners",
        default=None,
        type=list_str,
        required=True,
        help="Comma-separated list of runners to check status.",
    )
    parser.add_argument(
        "--token", default=None, type=str, required=True, help="A token that has actions:read permission."
    )
    args = parser.parse_args()

    get_runner_status(args.target_runners, args.token)
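
The script shells out to `curl` and parses its output; a `requests`-based equivalent, as a sketch (assuming the third-party `requests` package, which this commit does not add as a dependency):

import requests  # assumption: not used by the actual script, which shells out to curl

def get_runner_status_via_requests(target_runners, token):
    # Same endpoint and headers as the curl command in `get_runner_status` above.
    url = "https://api.github.com/repos/huggingface/transformers/actions/runners"
    headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}
    status = requests.get(url, headers=headers).json()
    for runner in status["runners"]:
        if runner["name"] in target_runners and runner["status"] == "offline":
            raise ValueError(f"{runner['name']} is offline!")

Either variant is invoked the same way as in the workflows, e.g. `python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token <token>`.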
@@ -387,7 +387,7 @@ class Message:
         return json.dumps(blocks)
 
     @staticmethod
-    def error_out(title, ci_title="", setup_failed=False, runner_failed=False):
+    def error_out(title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=False):
         blocks = []
 
         title_block = {"type": "header", "text": {"type": "plain_text", "text": title}}
@@ -397,10 +397,12 @@ class Message:
             ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
             blocks.append(ci_title_block)
 
-        if setup_failed:
-            text = "💔 Setup job failed. Tests are not run. 😭"
+        if runner_not_available:
+            text = "💔 CI runners are not available! Tests are not run. 😭"
         elif runner_failed:
             text = "💔 CI runners have problems! Tests are not run. 😭"
+        elif setup_failed:
+            text = "💔 Setup job failed. Tests are not run. 😭"
         else:
             text = "💔 There was an issue running the tests. 😭"
@@ -654,10 +656,13 @@ def prepare_reports(title, header, reports, to_truncate=True):
 
 if __name__ == "__main__":
-    setup_status = os.environ.get("SETUP_STATUS")
     runner_status = os.environ.get("RUNNER_STATUS")
+    runner_env_status = os.environ.get("RUNNER_ENV_STATUS")
+    setup_status = os.environ.get("SETUP_STATUS")
 
+    runner_not_available = True if runner_status is not None and runner_status != "success" else False
+    runner_failed = True if runner_env_status is not None and runner_env_status != "success" else False
     setup_failed = True if setup_status is not None and setup_status != "success" else False
-    runner_failed = True if runner_status is not None and runner_status != "success" else False
 
     org = "huggingface"
     repo = "transformers"
@@ -718,8 +723,8 @@ if __name__ == "__main__":
     else:
         ci_title = ""
 
-    if setup_failed or runner_failed:
-        Message.error_out(title, ci_title, setup_failed, runner_failed)
+    if runner_not_available or runner_failed or setup_failed:
+        Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed)
         exit(0)
 
     arguments = sys.argv[1:][0]
@@ -728,7 +733,7 @@ if __name__ == "__main__":
         # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
         models = [x.replace("models/", "models_") for x in models]
     except SyntaxError:
-        Message.error_out()
+        Message.error_out(title, ci_title)
         raise ValueError("Errored out.")
 
     github_actions_job_links = get_job_links()
......
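
Putting it together, the three `needs.*.result` values exported by the workflows map onto three flags with a fixed precedence in `notification_service.py`. A small worked example simulating a run where the availability check itself failed (the env variables mirror `RUNNER_STATUS`, `RUNNER_ENV_STATUS`, and `SETUP_STATUS` as set above):

import os

# Simulated job results: the availability check failed, so downstream jobs were skipped.
os.environ["RUNNER_STATUS"] = "failure"      # needs.check_runner_status.result
os.environ["RUNNER_ENV_STATUS"] = "skipped"  # needs.check_runners.result
os.environ["SETUP_STATUS"] = "skipped"       # needs.setup.result

runner_status = os.environ.get("RUNNER_STATUS")
runner_env_status = os.environ.get("RUNNER_ENV_STATUS")
setup_status = os.environ.get("SETUP_STATUS")

runner_not_available = runner_status is not None and runner_status != "success"
runner_failed = runner_env_status is not None and runner_env_status != "success"
setup_failed = setup_status is not None and setup_status != "success"

# All three flags are True, but `error_out` tests `runner_not_available` first,
# so the Slack report reads "💔 CI runners are not available! Tests are not run. 😭".
print(runner_not_available, runner_failed, setup_failed)  # True True True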