Unverified commit 25b769e5, authored by Dillon Cullinan, committed by GitHub
Browse files

ci: OPS-3142: Add multi-gpu test job (#6189)


Signed-off-by: Dillon Cullinan <dcullinan@nvidia.com>
parent 1349b890
...@@ -27,6 +27,11 @@ on: ...@@ -27,6 +27,11 @@ on:
required: false required: false
type: boolean type: boolean
default: true default: true
run_multi_gpu_tests:
  # Optional boolean switch for the multi-GPU tests; off unless explicitly set.
  # This workflow passes it through as `run_multi_gpu_tests` to the job it invokes.
  type: boolean
  required: false
  default: false
  description: "Whether to run multi-gpu tests"
copy_to_acr: copy_to_acr:
description: 'Whether to copy images to ACR' description: 'Whether to copy images to ACR'
required: false required: false
...@@ -117,6 +122,7 @@ jobs: ...@@ -117,6 +122,7 @@ jobs:
builder_name: ${{ inputs.builder_name }} builder_name: ${{ inputs.builder_name }}
build_image: ${{ inputs.build_image }} build_image: ${{ inputs.build_image }}
run_tests: ${{ inputs.run_tests }} run_tests: ${{ inputs.run_tests }}
run_multi_gpu_tests: ${{ inputs.run_multi_gpu_tests }}
copy_to_acr: ${{ inputs.copy_to_acr && matrix.platform == 'amd64' }} # no reason to copy ARM images to ACR copy_to_acr: ${{ inputs.copy_to_acr && matrix.platform == 'amd64' }} # no reason to copy ARM images to ACR
push_image: ${{ inputs.push_image }} push_image: ${{ inputs.push_image }}
build_timeout_minutes: ${{ inputs.build_timeout_minutes }} build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
......
...@@ -27,6 +27,11 @@ on: ...@@ -27,6 +27,11 @@ on:
required: false required: false
type: boolean type: boolean
default: true default: true
run_multi_gpu_tests:
  # Optional boolean switch; off unless explicitly set.
  # It is one of the conditions in the `multi-gpu-test` job's `if:` expression.
  type: boolean
  required: false
  default: false
  description: "Whether to run multi-gpu tests"
copy_to_acr: copy_to_acr:
description: 'Whether to copy images to ACR' description: 'Whether to copy images to ACR'
required: false required: false
...@@ -322,9 +327,68 @@ jobs: ...@@ -322,9 +327,68 @@ jobs:
parallel_mode: 'none' parallel_mode: 'none'
dind_as_sidecar: 'true' dind_as_sidecar: 'true'
# ============================================================================ # ============================================================================
# COPY TO ACR # MULTI-GPU TESTS
# ============================================================================ # ============================================================================
multi-gpu-test:
  # Pulls the image tagged from the build job's output and runs the pre-merge
  # tests marked gpu_2 or gpu_4 against it.
  # Multi-GPU support limited to AMD64 only
  if: |
    inputs.run_multi_gpu_tests &&
    inputs.build_image &&
    ( inputs.platform != 'arm64' )
  needs: [build]
  name: Multi-gpu Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
  runs-on: prod-tester-amd-gpu-4-v1
  env:
    FRAMEWORK: ${{ inputs.framework }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Calculate target tag
      id: calculate-target-tag
      shell: bash
      # Expression values are handed to the script through environment
      # variables instead of being spliced into the shell text, so their
      # content can never be interpreted as shell syntax.
      env:
        CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
        PLATFORM: ${{ inputs.platform }}
        TARGET_TAG_PLAIN: ${{ needs.build.outputs.target_tag_plain }}
        ECR_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
        ECR_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
      run: |
        # Keep only the part before the first dot (e.g. "12.9" -> "12").
        CUDA_VERSION="${CUDA_VERSION_RAW%%.*}"
        echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
        TEST_IMAGE="${ECR_ACCOUNT_ID}.dkr.ecr.${ECR_REGION}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${PLATFORM}"
        echo "test_image=${TEST_IMAGE}" >> "$GITHUB_OUTPUT"
    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
    - name: Pull relevant images
      shell: bash
      env:
        TEST_IMAGE: ${{ steps.calculate-target-tag.outputs.test_image }}
      run: |
        # Time both pulls together and report the total.
        start_time=$(date +%s)
        docker pull "${TEST_IMAGE}"
        docker pull quay.io/minio/minio
        end_time=$(date +%s)
        duration=$((end_time - start_time))
        echo "⏱️ Image pull duration: ${duration}s"
    # Run GPU tests sequentially (only on amd64 runners with GPU)
    # These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware
    - name: Run GPU tests (sequential)
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
        pytest_marks: '(gpu_2 or gpu_4) and pre_merge'
        framework: ${{ inputs.framework }}
        test_type: "pre_merge_gpu"
        platform_arch: ${{ inputs.platform }}
        enable_mypy: 'false'  # already covered by CPU tests
        hf_token: ${{ secrets.HF_TOKEN }}
        parallel_mode: 'none'
        dind_as_sidecar: 'true'
# ============================================================================
# COPY TO ACR
# ============================================================================
copy-to-acr: copy-to-acr:
needs: [build, test] needs: [build, test]
# Run if copy_to_acr is true AND build succeeded AND (test succeeded OR test was skipped) # Run if copy_to_acr is true AND build succeeded AND (test succeeded OR test was skipped)
......
...@@ -184,6 +184,7 @@ jobs: ...@@ -184,6 +184,7 @@ jobs:
${{ github.ref_name == 'main' && 'main-vllm' || '' }} ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests: true
test_gpu_timeout_minutes: 35 test_gpu_timeout_minutes: 35
build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
secrets: inherit secrets: inherit
...@@ -204,6 +205,7 @@ jobs: ...@@ -204,6 +205,7 @@ jobs:
${{ github.ref_name == 'main' && 'main-sglang' || '' }} ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests: true
build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
secrets: inherit secrets: inherit
...@@ -223,6 +225,7 @@ jobs: ...@@ -223,6 +225,7 @@ jobs:
${{ github.ref_name == 'main' && 'main-trtllm' || '' }} ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests: true
build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
secrets: inherit secrets: inherit
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment