Unverified commit ba7f2173, authored by Yih-Dar, committed by GitHub

Add runner availability check (#19054)


Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent ca485e56
name: Self-hosted runner (check runner status)
# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
on:
repository_dispatch:
schedule:
# run per hour
- cron: "* */1 * * *"
env:
TRANSFORMERS_IS_CI: yes
jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2
- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
send_results:
name: Send results to webhook
runs-on: ubuntu-latest
needs: check_runner_status
if: ${{ failure() }}
steps:
- name: Preliminary job status
shell: bash
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_EVENT: runner status check
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
pip install slack_sdk
python utils/notification_service.py
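Because `send_results` is gated on `if: ${{ failure() }}`, a Slack message is posted only when the runner check above fails. As a minimal sketch (not the repository's code), this is how the exported `RUNNER_STATUS` value is interpreted downstream, mirroring the logic added to `notification_service.py` later in this diff:

import os

# GitHub Actions sets `needs.<job>.result` to "success", "failure",
# "cancelled" or "skipped"; anything other than "success" is treated as
# "runners not available" by the notification script.
runner_status = os.environ.get("RUNNER_STATUS")  # e.g. "failure"
runner_not_available = runner_status is not None and runner_status != "success"
print(f"runner_not_available={runner_not_available}")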
......@@ -23,8 +23,21 @@ env:
RUN_PT_TF_CROSS_TESTS: 1
jobs:
run_check_runners:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2
- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
......@@ -39,7 +52,7 @@ jobs:
setup:
name: Setup
needs: run_check_runners
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
......@@ -83,7 +96,7 @@ jobs:
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
......@@ -136,7 +149,7 @@ jobs:
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
......@@ -185,7 +198,7 @@ jobs:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
needs: [run_check_runners, setup]
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
......@@ -236,13 +249,21 @@ jobs:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
needs: [
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
run_all_tests_torch_cuda_extensions_gpu
]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner status: ${{ needs.run_check_runners.result }}"
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"
- uses: actions/checkout@v2
......@@ -255,8 +276,9 @@ jobs:
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
CI_EVENT: nightly-build
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
......
......@@ -27,9 +27,43 @@ env:
RUN_PT_TF_CROSS_TESTS: 1
jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2
- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi
setup:
name: Setup
runs-on: ubuntu-latest
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
......@@ -50,21 +84,6 @@ jobs:
cd tests
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
run_check_runners:
name: Check Runners
needs: setup
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi
run_tests_single_gpu:
name: Model tests
strategy:
......@@ -76,7 +95,7 @@ jobs:
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [setup, run_check_runners]
needs: setup
steps:
- name: Update clone
working-directory: /transformers
......@@ -129,7 +148,7 @@ jobs:
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [setup, run_check_runners]
needs: setup
steps:
- name: Update clone
working-directory: /transformers
......@@ -175,13 +194,14 @@ jobs:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner status: ${{ needs.run_check_runners.result }}"
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"
- uses: actions/checkout@v2
......@@ -199,8 +219,9 @@ jobs:
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
......
......@@ -27,9 +27,43 @@ env:
RUN_PT_TF_CROSS_TESTS: 1
jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2
- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi
setup:
name: Setup
runs-on: ubuntu-latest
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
test_map: ${{ steps.set-matrix.outputs.test_map }}
......@@ -111,24 +145,9 @@ jobs:
echo "::set-output name=matrix::$keys"
echo "::set-output name=test_map::$test_map"
run_check_runners:
name: Check Runners
needs: setup
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi
run_tests_single_gpu:
name: Model tests
needs: [setup, run_check_runners]
needs: setup
# `dummy` means there is no test to run
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
strategy:
......@@ -213,7 +232,7 @@ jobs:
run_tests_multi_gpu:
name: Model tests
needs: [setup, run_check_runners]
needs: setup
# `dummy` means there is no test to run
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
strategy:
......@@ -300,7 +319,7 @@ jobs:
run_tests_torch_cuda_extensions_single_gpu:
name: Torch CUDA extension tests
needs: [setup, run_check_runners]
needs: setup
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
strategy:
fail-fast: false
......@@ -382,7 +401,7 @@ jobs:
run_tests_torch_cuda_extensions_multi_gpu:
name: Torch CUDA extension tests
needs: [setup, run_check_runners]
needs: setup
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
strategy:
fail-fast: false
......@@ -467,8 +486,9 @@ jobs:
runs-on: ubuntu-latest
if: always()
needs: [
check_runner_status,
check_runners,
setup,
run_check_runners,
run_tests_single_gpu,
run_tests_multi_gpu,
run_tests_torch_cuda_extensions_single_gpu,
......@@ -479,8 +499,9 @@ jobs:
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Setup status: ${{ needs.setup.result }}"
echo "Runner status: ${{ needs.run_check_runners.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
......@@ -527,8 +548,9 @@ jobs:
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
......
......@@ -22,8 +22,21 @@ env:
RUN_PT_TF_CROSS_TESTS: 1
jobs:
run_check_runners:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2
- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
......@@ -38,7 +51,7 @@ jobs:
setup:
name: Setup
needs: run_check_runners
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
......@@ -82,7 +95,7 @@ jobs:
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
......@@ -135,7 +148,7 @@ jobs:
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
......@@ -183,7 +196,7 @@ jobs:
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Update clone
working-directory: /transformers
......@@ -226,7 +239,7 @@ jobs:
container:
image: huggingface/transformers-pytorch-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Update clone
working-directory: /transformers
......@@ -270,7 +283,7 @@ jobs:
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Update clone
working-directory: /transformers
......@@ -312,7 +325,7 @@ jobs:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
needs: [run_check_runners, setup]
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
......@@ -362,7 +375,8 @@ jobs:
runs-on: ubuntu-latest
if: always()
needs: [
run_check_runners,
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
......@@ -376,7 +390,8 @@ jobs:
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner status: ${{ needs.run_check_runners.result }}"
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"
- uses: actions/checkout@v2
......@@ -389,8 +404,9 @@ jobs:
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_EVENT: scheduled
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
......
import argparse
import json
import subprocess
def get_runner_status(target_runners, token):
cmd = (
f'curl -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}"'
" https://api.github.com/repos/huggingface/transformers/actions/runners"
)
output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
o = output.stdout.decode("utf-8")
status = json.loads(o)
runners = status["runners"]
for runner in runners:
if runner["name"] in target_runners:
if runner["status"] == "offline":
raise ValueError(f"{runner['name']} is offline!")
if __name__ == "__main__":
def list_str(values):
return values.split(",")
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--target_runners",
default=None,
type=list_str,
required=True,
help="Comma-separated list of runners to check status.",
)
parser.add_argument(
"--token", default=None, type=str, required=True, help="A token that has actions:read permission."
)
args = parser.parse_args()
get_runner_status(args.target_runners, args.token)
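For reference, the workflows invoke this script as `python utils/check_self_hosted_runner.py --target_runners <name1>,<name2> --token <token>`. Below is an illustrative, abridged example of the JSON shape returned by the GitHub "list self-hosted runners for a repository" endpoint that `get_runner_status` iterates over; the runner entries are placeholders, not captured CI output:

# Hypothetical payload for illustration only.
sample = {
    "total_count": 2,
    "runners": [
        {"name": "single-gpu-ci-runner-docker", "status": "online", "busy": False},
        {"name": "multi-gpu-ci-runner-docker", "status": "offline", "busy": True},
    ],
}
# With a payload like this, the loop in get_runner_status() would raise
# ValueError("multi-gpu-ci-runner-docker is offline!"), which fails the
# `check_runner_status` job and triggers the Slack notification path.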
......@@ -387,7 +387,7 @@ class Message:
return json.dumps(blocks)
@staticmethod
def error_out(title, ci_title="", setup_failed=False, runner_failed=False):
def error_out(title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=False):
blocks = []
title_block = {"type": "header", "text": {"type": "plain_text", "text": title}}
......@@ -397,10 +397,12 @@ class Message:
ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
blocks.append(ci_title_block)
if setup_failed:
text = "💔 Setup job failed. Tests are not run. 😭"
if runner_not_available:
text = "💔 CI runners are not available! Tests are not run. 😭"
elif runner_failed:
text = "💔 CI runners have problems! Tests are not run. 😭"
elif setup_failed:
text = "💔 Setup job failed. Tests are not run. 😭"
else:
text = "💔 There was an issue running the tests. 😭"
......@@ -654,10 +656,13 @@ def prepare_reports(title, header, reports, to_truncate=True):
if __name__ == "__main__":
setup_status = os.environ.get("SETUP_STATUS")
runner_status = os.environ.get("RUNNER_STATUS")
runner_env_status = os.environ.get("RUNNER_ENV_STATUS")
setup_status = os.environ.get("SETUP_STATUS")
runner_not_available = True if runner_status is not None and runner_status != "success" else False
runner_failed = True if runner_env_status is not None and runner_env_status != "success" else False
setup_failed = True if setup_status is not None and setup_status != "success" else False
runner_failed = True if runner_status is not None and runner_status != "success" else False
org = "huggingface"
repo = "transformers"
......@@ -718,8 +723,8 @@ if __name__ == "__main__":
else:
ci_title = ""
if setup_failed or runner_failed:
Message.error_out(title, ci_title, setup_failed, runner_failed)
if runner_not_available or runner_failed or setup_failed:
Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed)
exit(0)
arguments = sys.argv[1:][0]
......@@ -728,7 +733,7 @@ if __name__ == "__main__":
# Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
models = [x.replace("models/", "models_") for x in models]
except SyntaxError:
Message.error_out()
Message.error_out(title, ci_title)
raise ValueError("Errored out.")
github_actions_job_links = get_job_links()
......
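The recurring workflow comment about passing `needs.setup.outputs.matrix` refers to the renaming step visible just above (`x.replace("models/", "models_")`). A minimal sketch of that mapping, using a made-up matrix value for illustration:

import json

# Hypothetical setup output; test folders such as "models/bert" become
# "models_bert" because artifact names use "_" instead of "/".
matrix = json.dumps(["models/bert", "models/gpt2", "pipelines"])
models = [x.replace("models/", "models_") for x in json.loads(matrix)]
print(models)  # ['models_bert', 'models_gpt2', 'pipelines']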