Commit 12cc1233 authored by Yih-Dar, committed by GitHub

Better way to run AMD CI with different flavors (#26634)



* Enable testing against mi250

* Change BERT to trigger tests

* Revert BERT's change

* AMD CI

* AMD CI

---------
Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 3ef71345
.github/workflows/self-push-amd-mi210-caller.yml (new file)

name: Self-hosted runner (AMD mi210 CI caller)

on:
  workflow_run:
    workflows: ["Self-hosted runner (push-caller)"]
    branches: ["main"]
    types: [completed]
  push:
    branches:
      - run_amd_push_ci_caller*
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
      - "utils/**"

jobs:
  run_amd_ci:
    name: AMD mi210
    if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
    uses: ./.github/workflows/self-push-amd.yml
    with:
      gpu_flavor: mi210
    secrets: inherit
.github/workflows/self-push-amd-mi250-caller.yml (new file)

name: Self-hosted runner (AMD mi250 CI caller)

on:
  workflow_run:
    workflows: ["Self-hosted runner (push-caller)"]
    branches: ["main"]
    types: [completed]
  push:
    branches:
      - run_amd_push_ci_caller*
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
      - "utils/**"

jobs:
  run_amd_ci:
    name: AMD mi250
    if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
    uses: ./.github/workflows/self-push-amd.yml
    with:
      gpu_flavor: mi250
    secrets: inherit
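
Both caller workflows gate the job with the same if: expression; only the gpu_flavor they pass differs. As a minimal sketch of how that gate evaluates, in plain Python (the helper and its arguments are hypothetical stand-ins for GitHub's cancelled() function and github.* context):

# Hypothetical model of the shared `if:` gate; not part of the commit.
def should_run_amd_ci(event_name: str, ref_name: str, cancelled: bool = False) -> bool:
    # (cancelled() != true) && ((event != 'schedule') || (push && branch-prefix match))
    return (not cancelled) and (
        event_name != "schedule"
        or (event_name == "push" and ref_name.startswith("run_amd_push_ci_caller"))
    )

assert should_run_amd_ci("workflow_run", "main")                 # completed push-caller run on main
assert should_run_amd_ci("push", "run_amd_push_ci_caller_test")  # direct push to a trigger branch
assert not should_run_amd_ci("push", "main", cancelled=True)     # cancelled runs never start the job

Since neither caller defines a schedule trigger, the expression reduces in practice to "not cancelled"; branch filtering is already handled by the push trigger's branch pattern.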
.github/workflows/self-push-amd.yml

 name: Self-hosted runner AMD GPU (push)

 on:
-  workflow_run:
-    workflows: ["Self-hosted runner (push-caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - ci_*
-      - ci-*
-    paths:
-      - "src/**"
-      - "tests/**"
-      - ".github/**"
-      - "templates/**"
-      - "utils/**"
-  repository_dispatch:
+  workflow_call:
+    inputs:
+      gpu_flavor:
+        required: true
+        type: string

 env:
   HF_HOME: /mnt/cache
@@ -45,8 +35,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-        gpu_flavor: [mi210]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -65,8 +54,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-        gpu_flavor: [mi210]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -164,8 +152,7 @@ jobs:
       matrix:
         folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
         machine_type: [single-gpu, multi-gpu]
-        gpu_flavor: [mi210]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -321,7 +308,7 @@ jobs:
       CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
       CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
       ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-      CI_EVENT: push
+      CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
       CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
       CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
       CI_SHA: ${{ env.CI_SHA }}
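
The net effect of the self-push-amd.yml change: the GPU flavor moves from a one-element build matrix into a workflow_call input, so a single reusable workflow serves whatever flavor a caller passes. A hypothetical sketch of the resulting runs-on label selection (function name invented for illustration):

# Hypothetical illustration of how the runner labels are assembled after the change.
def runner_labels(machine_type: str, gpu_flavor: str) -> list:
    # mirrors: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    return ["self-hosted", "docker-gpu", "amd-gpu", machine_type, gpu_flavor]

assert runner_labels("single-gpu", "mi210")[-1] == "mi210"
assert runner_labels("multi-gpu", "mi250")[-1] == "mi250"

Previously, supporting a new flavor meant editing the gpu_flavor matrix inside self-push-amd.yml; now it only takes a new caller workflow passing a different gpu_flavor value.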
utils/notification_service.py

@@ -897,6 +897,9 @@ if __name__ == "__main__":
         job_name_prefix = f"{framework} {version}"
     elif ci_event.startswith("Nightly CI"):
         job_name_prefix = "Nightly CI"
+    elif ci_event.startswith("Push CI (AMD) - "):
+        flavor = ci_event.replace("Push CI (AMD) - ", "")
+        job_name_prefix = f"AMD {flavor}"

     for model in model_results.keys():
         for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
@@ -962,7 +965,7 @@ if __name__ == "__main__":
         "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
     }

-    if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
+    if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI") or ci_event.startswith("Push CI (AMD)"):
         del additional_files["Examples directory"]
         del additional_files["PyTorch pipelines"]
         del additional_files["TensorFlow pipelines"]
@@ -1027,6 +1030,6 @@ if __name__ == "__main__":
     message = Message(title, ci_title, model_results, additional_results, selected_warnings=selected_warnings)

     # send report only if there is any failure (for push CI)
-    if message.n_failures or ci_event != "push":
+    if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
         message.post()
         message.post_reply()
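
The notification_service.py changes make the Slack report flavor-aware while keeping the on-failure-only posting rule for AMD push runs. A short sketch of the two new code paths, walked through with a sample CI_EVENT value (the value itself comes from self-push-amd.yml above):

# Sample value, as exported by the workflow: CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
ci_event = "Push CI (AMD) - mi250"

# New job-name prefix branch: strip the fixed prefix to recover the flavor.
if ci_event.startswith("Push CI (AMD) - "):
    flavor = ci_event.replace("Push CI (AMD) - ", "")  # -> "mi250"
    job_name_prefix = f"AMD {flavor}"                  # -> "AMD mi250"

# New posting rule: push-style events ("push" and "Push CI (AMD) ...") post only
# on failure; every other CI event always posts its report.
n_failures = 0  # pretend the run was green
always_post = ci_event != "push" and not ci_event.startswith("Push CI (AMD)")
assert not (bool(n_failures) or always_post)  # a green AMD push run stays silent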