"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "e760fcef2265a62a6a9cfbafdb207e7c3d5c3b36"
Unverified commit 7e48f3bd, authored by Anant Sharma, committed by GitHub
Browse files

ci: fold container-validation-dynamo into pr, post-merge, and nightly (#8525)


Signed-off-by: Anant Sharma <anants@nvidia.com>
parent dfde02c5
......@@ -52,18 +52,15 @@ Example: `lib/**/*.rs` matches all Rust files under `lib/`.
## Adding a New Filter Group
If you create a new filter in `filters.yaml`, you must also update the workflows:
1. Add the filter to `filters.yaml`
2. Update **both** workflow files to include the new filter in the uncovered files check:
- `.github/workflows/container-validation-backends.yml`
- `.github/workflows/container-validation-dynamo.yml`
In each workflow, find the `COVERED_FILES` line and add your new filter:
```bash
COVERED_FILES=$(echo "... ${{ steps.filter.outputs.YOURFILTER_all_modified_files }} ..." | ...)
```
If you create a new filter in `filters.yaml`, you must also update the shared
changed-files action so the coverage check knows about it:
1. Add the filter to `filters.yaml`.
2. Edit `.github/actions/changed-files/action.yml`:
- Expose the new filter as an output (see the existing `core`, `planner`,
`vllm`, `sglang`, `trtllm`, etc. entries at the top of the file).
- Add its `*_all_modified_files` to the `COVERED_FILES` line in the
"Check for uncovered files" step.
If you skip this step, CI will fail with "uncovered files" even though your filter exists.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Workflow header: triggered by pushes to main, to release/*.*.* branches,
# and to branches matching "pull-request/[0-9]+".
name: NVIDIA Dynamo Github Validation
on:
push:
branches:
- main
- release/*.*.*
- "pull-request/[0-9]+"
# Runs sharing a group cancel each other unless the ref is main
# (see cancel-in-progress below).
concurrency:
# The group name is a ternary operation. If the ref_name is 'main',
# then the group name uses the run_id to ensure a unique group for
# 'main' pushes. Otherwise, the group name is the ref_name, so that
# workflows on the same PR/branch have the same group name for cancelling.
group: dynamo-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
# Passed to the Dockerfile renderer and the image build; the major part
# (text before the first '.') is embedded in image and cache tags.
CUDA_VERSION: '12.9'
# Unique per run and attempt; re-exported by the changed-files job as
# its `builder_name` output.
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# Decides which areas need validation. Every area output is forced to true
# on main and on release/* refs; on any other ref it is true only when the
# changed-files action reported 'true' for that area. Downstream jobs
# compare these outputs against the string 'true'.
changed-files:
runs-on: ubuntu-latest
outputs:
core: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') || steps.changes.outputs.core == 'true' }}
planner: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') || steps.changes.outputs.planner == 'true' }}
frontend: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') || steps.changes.outputs.frontend == 'true' }}
sglang: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') || steps.changes.outputs.sglang == 'true' }}
vllm: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') || steps.changes.outputs.vllm == 'true' }}
trtllm: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release/') || steps.changes.outputs.trtllm == 'true' }}
# Builder name shared with the build job (consumed by its
# "Initialize Dynamo Builder" step).
builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
steps:
- name: Checkout code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Do not use fetch-depth: 0 — changed-files now works with shallow clone
# Repository-local composite action; its per-area outputs feed the job
# outputs above.
- name: Check for changes
id: changes
uses: ./.github/actions/changed-files
with:
gh_token: ${{ github.token }}
# Republishes the workflow-level BUILDER_NAME env value as a step output
# so it can be exposed as a job output.
- name: Export builder name
id: export-builder-name
run: |
echo "builder_name=${{ env.BUILDER_NAME }}" >> $GITHUB_OUTPUT
# Aggregate gate over every job in this workflow. It always runs, and the jq
# filter exits non-zero unless each needed job's result is "success" or
# "skipped".
dynamo-status-check:
if: always()
needs:
- changed-files
- build
- rust-checks
- mypy
- test-parallel
- test-sequential
- test-generic-gpu
runs-on: ubuntu-latest
steps:
- name: "Check all dependent jobs"
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
# Builds and pushes the dynamo runtime image, then builds and pushes a test
# image that uses the runtime image as its base. Runs when any area output
# of changed-files is 'true'.
build:
needs: changed-files
if: >-
needs.changed-files.outputs.core == 'true' ||
needs.changed-files.outputs.planner == 'true' ||
needs.changed-files.outputs.frontend == 'true' ||
needs.changed-files.outputs.sglang == 'true' ||
needs.changed-files.outputs.vllm == 'true' ||
needs.changed-files.outputs.trtllm == 'true'
runs-on: prod-builder-v3
name: Build
timeout-minutes: 60
outputs:
# Only pass the non-secret tag suffix between jobs (GitHub blanks outputs containing secrets)
runtime_tag_suffix: ${{ steps.define_image_tag.outputs.runtime_tag_suffix }}
test_tag_suffix: ${{ steps.define_image_tag.outputs.test_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
lfs: true
# Uses the builder name exported by the changed-files job.
- name: Initialize Dynamo Builder
uses: ./.github/actions/init-dynamo-builder
with:
builder_name: ${{ needs.changed-files.outputs.builder_name }}
flavor: general
arch: linux/amd64
# Runs on push events, or when the pull-request head repository is this
# repository.
- name: Docker Login
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
uses: ./.github/actions/docker-login
with:
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
# Tag suffixes embed the commit SHA and the CUDA major version. The full
# remote tags include the ECR hostname (assembled from secrets) and are
# kept as step outputs only; the suffixes are what the job exposes.
- name: Define Image Tag
id: define_image_tag
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
CUDA_MAJOR=${CUDA_VERSION%%.*}
RUNTIME_TAG_SUFFIX="${{ github.sha }}-dynamo-runtime-cuda${CUDA_MAJOR}-amd64"
TEST_TAG_SUFFIX="${{ github.sha }}-dynamo-test-cuda${CUDA_MAJOR}-amd64"
echo "runtime_tag_suffix=${RUNTIME_TAG_SUFFIX}" >> $GITHUB_OUTPUT
echo "test_tag_suffix=${TEST_TAG_SUFFIX}" >> $GITHUB_OUTPUT
echo "runtime_remote_tag=${ECR_HOSTNAME}/ai-dynamo/dynamo:${RUNTIME_TAG_SUFFIX}" >> $GITHUB_OUTPUT
echo "test_remote_tag=${ECR_HOSTNAME}/ai-dynamo/dynamo:${TEST_TAG_SUFFIX}" >> $GITHUB_OUTPUT
- name: Generate Dockerfile
shell: bash
run: |
echo "Generating Dockerfile for target: runtime and framework: dynamo"
python ./container/render.py \
--target=runtime \
--framework=dynamo \
--platform=amd64 \
--cuda-version=${{ env.CUDA_VERSION }} \
--show-result \
--output-short-filename
- name: Build and Push Runtime Image
uses: ./.github/actions/docker-remote-build
with:
image_tag: ${{ steps.define_image_tag.outputs.runtime_remote_tag }}
framework: dynamo
target: runtime
platform: amd64
cuda_version: ${{ env.CUDA_VERSION }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
push_image: 'true'
# The test image is built from container/Dockerfile.test with the runtime
# image pushed above as BASE_IMAGE. The registry cache is always read from
# both the main-* and release-* cache tags, but is written only when the
# ref is main (main-* tag) or starts with "release" (release-* tag).
# ${CACHE_ARGS} is left unquoted so that it expands into separate
# arguments.
- name: Build and Push Test Image
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
shell: bash
run: |
CACHE_TAG="test-dynamo-cuda${CUDA_VERSION%%.*}-amd64-cache"
CACHE_ARGS="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG}"
CACHE_ARGS+=" --cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG}"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
CACHE_ARGS+=" --cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG},mode=max"
elif [[ "$GITHUB_REF_NAME" =~ ^release ]]; then
CACHE_ARGS+=" --cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG},mode=max"
fi
docker buildx build \
--progress=plain \
--push \
--platform linux/amd64 \
-f container/Dockerfile.test \
--build-arg BASE_IMAGE=${{ steps.define_image_tag.outputs.runtime_remote_tag }} \
${CACHE_ARGS} \
-t ${{ steps.define_image_tag.outputs.test_remote_tag }} .
# Rust formatting, clippy, and test gate. Runs inside the runtime image built
# by the `build` job, and only when the `core` area changed.
rust-checks:
needs: [changed-files, build]
if: needs.changed-files.outputs.core == 'true'
runs-on: prod-tester-amd-gpu-v1
name: Rust Checks
timeout-minutes: 30
env:
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_rust_dynamo
# Full image reference rebuilt from secrets plus the non-secret suffix
# exported by the build job.
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.runtime_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull runtime image
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run Rust checks (block-manager + media-ffmpeg + integration tests)
# Secrets reach the script through the step environment instead of being
# templated into the script text. `-e NAME` (no value) makes docker copy
# the variable from this environment, so the AWS credentials never appear
# on the docker command line.
env:
SCCACHE_S3_BUCKET: ${{ secrets.SCCACHE_S3_BUCKET }}
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
docker run --rm --runtime=nvidia --gpus all --user root -w /workspace/lib/llm \
--name ${{ env.CONTAINER_ID }}_rust_checks \
-e SCCACHE_BUCKET="${SCCACHE_S3_BUCKET}" \
-e SCCACHE_REGION="${AWS_DEFAULT_REGION}" \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
${{ env.IMAGE_TAG }} \
bash -ec 'ARCH_ALT=x86_64 /workspace/container/use-sccache.sh install && \
eval $(/workspace/container/use-sccache.sh setup-env) && \
rustup component add rustfmt clippy && \
cargo fmt -- --check && \
cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
cargo clippy -p kvbm-physical --no-deps --all-targets -- -D warnings && \
cargo test --locked -p kvbm-physical --features testing-kvbm -- --nocapture --test-threads=4 && \
/workspace/container/use-sccache.sh show-stats "Rust Checks"'
# Type-checks Python inside the test image built by the `build` job.
# First container run: every directory directly under components/src/dynamo
# except sglang, vllm and trtllm is always checked; each of those three is
# appended to TARGETS only when its changed-files output is 'true'. The
# `${{ cond && '...' || ':' }}` lines render either to a TARGETS append or to
# the shell no-op `:`.
# Second container run: checks the `dynamo` package with MYPYPATH set to
# lib/bindings/python/src.
mypy:
needs: [changed-files, build]
if: >-
needs.changed-files.outputs.core == 'true' ||
needs.changed-files.outputs.planner == 'true' ||
needs.changed-files.outputs.frontend == 'true' ||
needs.changed-files.outputs.sglang == 'true' ||
needs.changed-files.outputs.vllm == 'true' ||
needs.changed-files.outputs.trtllm == 'true'
runs-on: prod-tester-amd-v1
name: Mypy
timeout-minutes: 15
env:
# Full image reference rebuilt from secrets plus the non-secret suffix
# exported by the build job.
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull test image
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run mypy
run: |
docker run --rm -w /workspace \
--name mypy_${{ github.run_id }}_${{ github.run_attempt }} \
${{ env.IMAGE_TAG }} \
bash -c '
MYPYPATH=components/src:lib/bindings/python/src
# Always check shared (non-backend) code
TARGETS=$(find components/src/dynamo -maxdepth 1 -mindepth 1 -type d ! -name sglang ! -name vllm ! -name trtllm | sort | tr "\n" " ")
# Add only the backends that changed
${{ needs.changed-files.outputs.sglang == 'true' && 'TARGETS="$TARGETS components/src/dynamo/sglang"' || ':' }}
${{ needs.changed-files.outputs.vllm == 'true' && 'TARGETS="$TARGETS components/src/dynamo/vllm"' || ':' }}
${{ needs.changed-files.outputs.trtllm == 'true' && 'TARGETS="$TARGETS components/src/dynamo/trtllm"' || ':' }}
MYPYPATH=$MYPYPATH mypy --explicit-package-bases $TARGETS
'
docker run --rm -w /workspace \
--name mypy_bindings_${{ github.run_id }}_${{ github.run_attempt }} \
${{ env.IMAGE_TAG }} \
bash -c 'MYPYPATH=lib/bindings/python/src mypy -p dynamo'
# Pre-merge pytest run for tests marked `parallel` and `gpu_0`, excluding the
# vllm/sglang/trtllm suites. Uses the test image from the `build` job with
# parallel_mode '4'. Runs when core, planner, or frontend changed.
test-parallel:
name: Pytest (parallel)
needs: [changed-files, build, mypy]
if: >-
needs.changed-files.outputs.core == 'true' ||
needs.changed-files.outputs.planner == 'true' ||
needs.changed-files.outputs.frontend == 'true'
runs-on: prod-builder-amd-v1
timeout-minutes: 30
env:
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
- name: Pull test image
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run pytest (parallel tests with xdist)
uses: ./.github/actions/pytest
with:
test_suite_name: dynamo
test_type: "pre_merge_parallel"
pytest_marks: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)"
parallel_mode: '4'
image_tag: ${{ env.IMAGE_TAG }}
platform_arch: amd64
dind_as_sidecar: 'false'
hf_token: ${{ secrets.HF_TOKEN }}
# Pre-merge pytest run for `gpu_0` tests that are NOT marked `parallel`,
# excluding the vllm/sglang/trtllm suites. Uses the test image from the
# `build` job with parallel_mode 'none'. Runs when core, planner, or frontend
# changed.
test-sequential:
name: Pytest (sequential)
needs: [changed-files, build, mypy]
if: >-
needs.changed-files.outputs.core == 'true' ||
needs.changed-files.outputs.planner == 'true' ||
needs.changed-files.outputs.frontend == 'true'
runs-on: prod-builder-amd-v1
timeout-minutes: 30
env:
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
- name: Pull test image
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run pytest (sequential tests)
uses: ./.github/actions/pytest
with:
test_suite_name: dynamo
test_type: "pre_merge_sequential"
pytest_marks: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)"
parallel_mode: 'none'
image_tag: ${{ env.IMAGE_TAG }}
platform_arch: amd64
dind_as_sidecar: 'false'
hf_token: ${{ secrets.HF_TOKEN }}
# Pre-merge pytest run for tests marked `none` and `gpu_1`, on a GPU runner.
# Uses the test image from the `build` job; runs only when `core` changed.
test-generic-gpu:
needs: [changed-files, build, mypy]
if: needs.changed-files.outputs.core == 'true'
runs-on: prod-tester-amd-gpu-v1
name: Pytest (GPU)
timeout-minutes: 30
env:
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull test image
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run pytest (gpu)
uses: ./.github/actions/pytest
with:
image_tag: ${{ env.IMAGE_TAG }}
pytest_marks: "pre_merge and none and gpu_1"
# NOTE(review): this key was `framework`, while test-parallel and
# test-sequential pass `test_suite_name` to the same action. Aligned with
# the siblings; confirm against .github/actions/pytest/action.yml.
test_suite_name: dynamo
test_type: "pre_merge_gpu"
platform_arch: amd64
hf_token: ${{ secrets.HF_TOKEN }}
parallel_mode: 'none'
dind_as_sidecar: 'true'
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Dynamo Pipeline — single entry point for building the dynamo runtime image
# and running all of its associated checks (rust, mypy, pytest parallel/
# sequential/GPU).
name: Dynamo Pipeline
# Reusable workflow: it is only started by other workflows via workflow_call.
on:
workflow_call:
inputs:
builder_name:
description: 'BuildKit builder name'
required: true
type: string
cuda_version:
description: 'CUDA version for the runtime image (e.g. 12.9)'
required: false
type: string
default: '12.9'
fresh_builder:
description: 'Always create a fresh K8s BuildKit builder (for nightly)'
required: false
type: boolean
default: false
build_timeout_minutes:
description: 'Timeout in minutes forwarded to the shared image build workflow'
required: false
type: number
default: 30
no_cache:
description: 'Disable BuildKit cache-from/to. Nightly sets true for regression detection.'
required: false
type: boolean
default: false
# The three marker inputs below are required so that each caller states
# explicitly which tests it wants selected.
cpu_parallel_test_markers:
description: 'Test marker expression for the CPU-only "parallel" test job'
required: true
type: string
cpu_sequential_test_markers:
description: 'Test marker expression for the CPU-only "sequential" test job'
required: true
type: string
gpu_test_markers:
description: 'Test marker expression for the GPU test job'
required: true
type: string
secrets:
AWS_DEFAULT_REGION:
required: true
AWS_ACCOUNT_ID:
required: true
AZURE_ACR_HOSTNAME:
required: true
AZURE_ACR_USER:
required: true
AZURE_ACR_PASSWORD:
required: true
SCCACHE_S3_BUCKET:
required: false
HF_TOKEN:
required: false
jobs:
# Builds the dynamo runtime image through the shared build workflow for both
# linux/amd64 and linux/arm64. The other jobs in this workflow depend on it
# and read `target_tag_plain` from its outputs.
image:
uses: ./.github/workflows/shared-build-image.yml
with:
framework: dynamo
target: runtime
# The single CUDA version is wrapped into a JSON-array string.
cuda_version: '["${{ inputs.cuda_version }}"]'
platform: 'linux/amd64,linux/arm64'
builder_name: ${{ inputs.builder_name }}
fresh_builder: ${{ inputs.fresh_builder }}
no_cache: ${{ inputs.no_cache }}
build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
secrets: inherit
# rust-gpu-checks + mypy run inside the built images because they depend on
# native libs (NIXL, ffmpeg, CUDA) and pinned Python package versions that
# aren't reproducible on plain ubuntu-latest runners.
# Rust formatting, clippy, and test gate, executed inside the runtime image
# produced by the `image` job on a GPU runner.
rust-gpu:
needs: image
runs-on: prod-tester-amd-gpu-v1
timeout-minutes: 30
env:
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_rust_dynamo
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Calculate runtime image tag
id: image
shell: bash
# Expression values are passed through env rather than templated into the
# script text, so their content cannot be interpreted as shell syntax.
env:
ECR_REPOSITORY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo
CUDA_VERSION: ${{ inputs.cuda_version }}
COMMIT_SHA: ${{ github.sha }}
TARGET_TAG_PLAIN: ${{ needs.image.outputs.target_tag_plain }}
run: |
CUDA_MAJOR=${CUDA_VERSION%%.*}
IMAGE_TAG="${COMMIT_SHA}-${TARGET_TAG_PLAIN}-cuda${CUDA_MAJOR}"
echo "runtime_image=${ECR_REPOSITORY}:${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull runtime image
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ steps.image.outputs.runtime_image }}
- name: Run Rust checks (block-manager + media-ffmpeg + integration tests)
# AWS_ROLE_ARN and AWS_WEB_IDENTITY_TOKEN_FILE are not set anywhere in this
# workflow; presumably the runner environment provides them (verify). The
# token file is mounted read-only at /run/secrets/aws-token and the
# in-container AWS_WEB_IDENTITY_TOKEN_FILE points at that path.
env:
SCCACHE_S3_BUCKET: ${{ secrets.SCCACHE_S3_BUCKET }}
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
run: |
: "${AWS_WEB_IDENTITY_TOKEN_FILE:?AWS_WEB_IDENTITY_TOKEN_FILE is not set on this runner}"
docker run --rm --runtime=nvidia --gpus all --user root -w /workspace/lib/llm \
--name ${{ env.CONTAINER_ID }}_rust_checks \
-e SCCACHE_BUCKET="${SCCACHE_S3_BUCKET}" \
-e SCCACHE_REGION="${AWS_DEFAULT_REGION}" \
-e AWS_ROLE_ARN \
-e AWS_WEB_IDENTITY_TOKEN_FILE=/run/secrets/aws-token \
-v "${AWS_WEB_IDENTITY_TOKEN_FILE}:/run/secrets/aws-token:ro" \
${{ steps.image.outputs.runtime_image }} \
bash -ec 'ARCH_ALT=x86_64 /workspace/container/use-sccache.sh install && \
eval $(/workspace/container/use-sccache.sh setup-env) && \
rustup component add rustfmt clippy && \
cargo fmt -- --check && \
cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
cargo clippy -p kvbm-physical --no-deps --all-targets -- -D warnings && \
cargo test --locked -p kvbm-physical --features testing-kvbm -- --nocapture --test-threads=4 && \
/workspace/container/use-sccache.sh show-stats "Rust Checks"'
# Type-checks Python inside the test image (the runtime tag with a "-test"
# suffix). All directories directly under components/src/dynamo are checked,
# followed by a separate check of the `dynamo` bindings package.
mypy:
needs: image
runs-on: prod-tester-amd-v1
timeout-minutes: 15
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Calculate test image tag
id: image
shell: bash
# Expression values are passed through env rather than templated into the
# script text, so their content cannot be interpreted as shell syntax.
env:
ECR_REPOSITORY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo
CUDA_VERSION: ${{ inputs.cuda_version }}
COMMIT_SHA: ${{ github.sha }}
TARGET_TAG_PLAIN: ${{ needs.image.outputs.target_tag_plain }}
run: |
CUDA_MAJOR=${CUDA_VERSION%%.*}
IMAGE_TAG="${COMMIT_SHA}-${TARGET_TAG_PLAIN}-cuda${CUDA_MAJOR}-test"
echo "test_image=${ECR_REPOSITORY}:${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull test image
run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ steps.image.outputs.test_image }}
- name: Run mypy (components)
run: |
docker run --rm -w /workspace \
--name mypy_${{ github.run_id }}_${{ github.run_attempt }} \
${{ steps.image.outputs.test_image }} \
bash -c '
MYPYPATH=components/src:lib/bindings/python/src
TARGETS=$(find components/src/dynamo -maxdepth 1 -mindepth 1 -type d | sort | tr "\n" " ")
MYPYPATH=$MYPYPATH mypy --explicit-package-bases $TARGETS
'
- name: Run mypy (bindings)
run: |
docker run --rm -w /workspace \
--name mypy_bindings_${{ github.run_id }}_${{ github.run_attempt }} \
${{ steps.image.outputs.test_image }} \
bash -c 'MYPYPATH=lib/bindings/python/src mypy -p dynamo'
# TODO: real xdist parallelism currently runs into port conflicts, so
# cpu_parallel_mode is 'none' below (tests run with -n 0) and this job is
# NOT actually parallel right now.
# CPU-only run of the caller-supplied "parallel" marker expression on amd64
# and arm64, via the shared test workflow. GPU tests and the sanity check
# are disabled for this job.
parallel:
name: test
needs: image
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: dynamo
test_type: parallel
amd_runner: prod-tester-amd-v1
target_tag_plain: ${{ needs.image.outputs.target_tag_plain }}
cuda_version: '["${{ inputs.cuda_version }}"]'
platform: '["amd64", "arm64"]'
run_sanity_check: false
run_cpu_only_tests: true
cpu_only_test_markers: ${{ inputs.cpu_parallel_test_markers }}
cpu_only_test_timeout_minutes: 30
cpu_parallel_mode: 'none'
run_gpu_tests: false
secrets: inherit
# CPU-only run of the caller-supplied "sequential" marker expression on amd64
# and arm64, via the shared test workflow. Apart from test_type and the
# marker input, it is configured identically to the `parallel` job.
sequential:
name: test
needs: image
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: dynamo
test_type: sequential
amd_runner: prod-tester-amd-v1
target_tag_plain: ${{ needs.image.outputs.target_tag_plain }}
cuda_version: '["${{ inputs.cuda_version }}"]'
platform: '["amd64", "arm64"]'
run_sanity_check: false
run_cpu_only_tests: true
cpu_only_test_markers: ${{ inputs.cpu_sequential_test_markers }}
cpu_only_test_timeout_minutes: 30
cpu_parallel_mode: 'none'
run_gpu_tests: false
secrets: inherit
# GPU run of the caller-supplied GPU marker expression via the shared test
# workflow. Unlike the two CPU jobs, it targets amd64 only and uses the GPU
# runner; CPU-only tests and the sanity check are disabled.
gpu:
name: test
needs: image
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: dynamo
test_type: gpu
amd_runner: prod-tester-amd-gpu-v1
target_tag_plain: ${{ needs.image.outputs.target_tag_plain }}
cuda_version: '["${{ inputs.cuda_version }}"]'
platform: '["amd64"]'
run_sanity_check: false
run_cpu_only_tests: false
run_gpu_tests: true
gpu_test_markers: ${{ inputs.gpu_test_markers }}
gpu_test_timeout_minutes: 30
secrets: inherit
......@@ -119,6 +119,23 @@ jobs:
multi_gpu_test_timeout_minutes: 120
secrets: inherit
# ============================================================================
# DYNAMO RUNTIME PIPELINE
# ============================================================================
# Calls the reusable dynamo pipeline with a fresh builder, cache disabled,
# and a 90-minute build timeout. The marker expressions carry no
# pre_merge/post_merge restriction, unlike the other callers of this
# workflow.
dynamo-pipeline:
name: dynamo-runtime
needs: [create-fresh-builder]
uses: ./.github/workflows/dynamo-pipeline.yml
with:
builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
fresh_builder: true
no_cache: true
build_timeout_minutes: 90
cpu_parallel_test_markers: 'parallel and not (vllm or sglang or trtllm) and (gpu_0)'
cpu_sequential_test_markers: 'not parallel and not (vllm or sglang or trtllm) and (gpu_0)'
gpu_test_markers: 'none and gpu_1'
secrets: inherit
# ============================================================================
# CLEANUP
# ============================================================================
......@@ -126,7 +143,7 @@ jobs:
name: Clean K8s builder if exists
runs-on: prod-default-small-v2
if: always()
needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, create-fresh-builder]
needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, dynamo-pipeline, create-fresh-builder]
permissions:
contents: read
steps:
......@@ -149,7 +166,7 @@ jobs:
name: Notify Slack
runs-on: ubuntu-slim
if: always() && failure()
needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline ]
needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline, dynamo-pipeline ]
permissions:
contents: read
steps:
......
......@@ -117,6 +117,19 @@ jobs:
multi_gpu_test_timeout_minutes: 60
secrets: inherit
# ============================================================================
# DYNAMO RUNTIME PIPELINE
# ============================================================================
# Calls the reusable dynamo pipeline with default build settings. It has no
# `needs`, the builder name is derived inline from the run id and attempt,
# and the marker expressions select tests marked pre_merge or post_merge.
dynamo-pipeline:
name: dynamo-runtime
uses: ./.github/workflows/dynamo-pipeline.yml
with:
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
cpu_parallel_test_markers: '(pre_merge or post_merge) and parallel and not (vllm or sglang or trtllm) and (gpu_0)'
cpu_sequential_test_markers: '(pre_merge or post_merge) and not parallel and not (vllm or sglang or trtllm) and (gpu_0)'
gpu_test_markers: '(pre_merge or post_merge) and none and gpu_1'
secrets: inherit
# ============================================================================
# DEV PIPELINES
# ============================================================================
......@@ -434,7 +447,7 @@ jobs:
name: Clean K8s builder if exists
runs-on: prod-default-small-v2
if: always()
needs: [planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image]
needs: [planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, dynamo-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image]
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
......@@ -455,7 +468,7 @@ jobs:
name: Notify Slack
runs-on: ubuntu-slim
if: always() && failure()
needs: [ planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie ]
needs: [ planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, dynamo-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie ]
permissions:
contents: read
steps:
......
......@@ -30,6 +30,7 @@ jobs:
vllm: ${{ steps.changes.outputs.vllm }}
sglang: ${{ steps.changes.outputs.sglang }}
trtllm: ${{ steps.changes.outputs.trtllm }}
frontend: ${{ steps.changes.outputs.frontend }}
builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
steps:
- name: Checkout code
......@@ -83,6 +84,17 @@ jobs:
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped", "cancelled"] | any($result == .))'
# Gate for the dynamo pipeline. It always runs, and the jq filter exits
# non-zero unless both changed-files and dynamo-pipeline finished with
# "success" or "skipped". "cancelled" is not in the accepted list here.
dynamo-status-check:
runs-on: ubuntu-slim
needs:
- changed-files
- dynamo-pipeline
if: always()
steps:
- name: Check all dynamo jobs
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
# ============================================================================
# Operator
# ============================================================================
......@@ -371,6 +383,28 @@ jobs:
run_gpu_tests: false
secrets: inherit
# ============================================================================
# DYNAMO RUNTIME PIPELINE
# ============================================================================
# Calls the reusable dynamo pipeline when any of the listed changed-files
# outputs is 'true'. The builder name comes from the changed-files job and
# the marker expressions select pre_merge tests only.
dynamo-pipeline:
name: dynamo-runtime
needs: [changed-files]
if: |
needs.changed-files.outputs.core == 'true' ||
needs.changed-files.outputs.planner == 'true' ||
needs.changed-files.outputs.frontend == 'true' ||
needs.changed-files.outputs.vllm == 'true' ||
needs.changed-files.outputs.sglang == 'true' ||
needs.changed-files.outputs.trtllm == 'true'
uses: ./.github/workflows/dynamo-pipeline.yml
with:
builder_name: ${{ needs.changed-files.outputs.builder_name }}
cpu_parallel_test_markers: 'pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)'
cpu_sequential_test_markers: 'pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)'
gpu_test_markers: 'pre_merge and none and gpu_1'
secrets: inherit
# ============================================================================
# IMAGE COMPLIANCE PIPELINES
# ============================================================================
......@@ -586,6 +620,7 @@ jobs:
- sglang-multi-gpu-test
- trtllm-copy-to-acr
- trtllm-multi-gpu-test
- dynamo-pipeline
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment