Unverified commit 72762dac, authored by Anant Sharma and committed by GitHub
Browse files

ci: migrate container-validation-dynamo to self-hosted runners (#6381)


Signed-off-by: Anant Sharma <anants@nvidia.com>
parent fc229004
...@@ -8,18 +8,26 @@ on: ...@@ -8,18 +8,26 @@ on:
branches: branches:
- main - main
- release/*.*.* - release/*.*.*
pull_request: - "pull-request/[0-9]+"
# Cancel any previous check runs for the same pull request to avoid redundant workflows.
concurrency: concurrency:
group: ${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || format('{0}-{1}', github.workflow, github.run_id) }} # The group name is a ternary operation. If the ref_name is 'main',
cancel-in-progress: ${{ github.event_name == 'pull_request' }} # then the group name uses the run_id to ensure a unique group for
# 'main' pushes. Otherwise, the group name is the ref_name, so that
# workflows on the same PR/branch have the same group name for cancelling.
group: dynamo-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
CUDA_VERSION: '12.9'
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs: jobs:
changed-files: changed-files:
runs-on: ubuntu-latest runs-on: ubuntu-latest
outputs: outputs:
core: ${{ steps.changes.outputs.core }} core: ${{ steps.changes.outputs.core }}
builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
...@@ -30,115 +38,181 @@ jobs: ...@@ -30,115 +38,181 @@ jobs:
uses: ./.github/actions/changed-files uses: ./.github/actions/changed-files
with: with:
gh_token: ${{ github.token }} gh_token: ${{ github.token }}
- name: Export builder name
id: export-builder-name
run: |
echo "builder_name=${{ env.BUILDER_NAME }}" >> $GITHUB_OUTPUT
dynamo-status-check: dynamo-status-check:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [changed-files, build-test] needs: [changed-files, build, rust-checks, test-parallel, test-sequential]
if: always() if: always()
steps: steps:
- name: "Check all dependent jobs" - name: "Check all dependent jobs"
run: | run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))' echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
build-test: build:
needs: changed-files needs: changed-files
if: needs.changed-files.outputs.core == 'true' if: needs.changed-files.outputs.core == 'true'
runs-on: runs-on: prod-builder-v3
group: Fastchecker name: Build
name: Build and Test - dynamo timeout-minutes: 60
env: outputs:
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}_dynamo # Only pass the non-secret tag suffix between jobs (GitHub blanks outputs containing secrets)
PYTEST_XML_FILE: pytest_test_report.xml image_tag_suffix: ${{ steps.define_image_tag.outputs.image_tag_suffix }}
PYTEST_PARALLEL_XML_FILE: pytest_parallel.xml
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v4 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
with: with:
lfs: true lfs: true
- name: Set up Docker Buildx - name: Initialize Dynamo Builder
uses: docker/setup-buildx-action@v3 uses: ./.github/actions/init-dynamo-builder
- name: Login to NGC with:
builder_name: ${{ needs.changed-files.outputs.builder_name }}
flavor: general
arch: amd64
- name: Docker Login
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
uses: ./.github/actions/docker-login uses: ./.github/actions/docker-login
with: with:
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Define Image Tag - name: Define Image Tag
id: define_image_tag id: define_image_tag
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: | run: |
echo "image_tag=dynamo:latest-dev" >> $GITHUB_OUTPUT CUDA_MAJOR=${CUDA_VERSION%%.*}
IMAGE_TAG_SUFFIX="${{ github.sha }}-dynamo-dev-cuda${CUDA_MAJOR}-amd64"
echo "image_tag_suffix=${IMAGE_TAG_SUFFIX}" >> $GITHUB_OUTPUT
echo "remote_tag=${ECR_HOSTNAME}/ai-dynamo/dynamo:${IMAGE_TAG_SUFFIX}" >> $GITHUB_OUTPUT
- name: Generate Dockerfile - name: Generate Dockerfile
shell: bash shell: bash
run: | run: |
echo "Generating Dockerfile for target: ${{ inputs.target }} and framework: ${{ inputs.framework }}" echo "Generating Dockerfile for target: dev and framework: dynamo"
python ./container/render.py \ python ./container/render.py \
--target=dev \ --target=dev \
--framework=dynamo \ --framework=dynamo \
--platform=amd64 \ --platform=amd64 \
--cuda-version=${{ env.CUDA_VERSION }} \
--show-result \ --show-result \
--output-short-filename --output-short-filename
- name: Build image - name: Build and Push Container
env: uses: ./.github/actions/docker-remote-build
GITHUB_TOKEN: ${{ secrets.CI_TOKEN }} with:
run: | image_tag: ${{ steps.define_image_tag.outputs.remote_tag }}
docker buildx build \ framework: dynamo
--progress=plain \ target: dev
--tag ${{ steps.define_image_tag.outputs.image_tag }} \ platform: amd64
-f ./container/rendered.Dockerfile \ cuda_version: ${{ env.CUDA_VERSION }}
--build-arg ENABLE_MEDIA_FFMPEG=true \ ci_token: ${{ secrets.CI_TOKEN }}
--build-arg ENABLE_KVBM=true \ aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
--load . aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Start services with docker-compose sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
working-directory: ./deploy aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
run: | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
docker compose up -d nats-server etcd-server push_image: 'true'
rust-checks:
needs: [changed-files, build]
if: needs.changed-files.outputs.core == 'true'
runs-on: prod-builder-amd-v1
name: Rust Checks
timeout-minutes: 30
env:
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_rust_dynamo
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.image_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image
run: docker pull ${{ env.IMAGE_TAG }}
- name: Run Rust checks (block-manager + media-ffmpeg + integration tests) - name: Run Rust checks (block-manager + media-ffmpeg + integration tests)
run: | run: |
docker run --rm -w /workspace/lib/llm \ docker run --rm -w /workspace/lib/llm \
--name ${{ env.CONTAINER_ID }}_rust_checks \ --name ${{ env.CONTAINER_ID }}_rust_checks \
${{ steps.define_image_tag.outputs.image_tag }} \ -e SCCACHE_BUCKET=${{ secrets.SCCACHE_S3_BUCKET }} \
bash -ec 'rustup component add rustfmt clippy && \ -e SCCACHE_REGION=${{ secrets.AWS_DEFAULT_REGION }} \
-e AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }} \
-e AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }} \
${{ env.IMAGE_TAG }} \
bash -ec 'ARCH_ALT=x86_64 /workspace/container/use-sccache.sh install && \
eval $(/workspace/container/use-sccache.sh setup-env) && \
rustup component add rustfmt clippy && \
cargo fmt -- --check && \ cargo fmt -- --check && \
cargo clippy --features block-manager,media-ffmpeg --no-deps --all-targets -- -D warnings && \ cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl && \ cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
cargo test --locked --features integration -- --nocapture' /workspace/container/use-sccache.sh show-stats "Rust Checks"'
- name: Cleanup services
if: always() test-parallel:
working-directory: ./deploy needs: [changed-files, build]
run: | if: needs.changed-files.outputs.core == 'true'
docker compose down runs-on: prod-builder-amd-v1
name: Pytest (parallel)
timeout-minutes: 30
env:
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.image_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image
run: docker pull ${{ env.IMAGE_TAG }}
- name: Run pytest (parallel tests with xdist) - name: Run pytest (parallel tests with xdist)
env: uses: ./.github/actions/pytest
PYTEST_MARKS: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0 or gpu_1)" with:
run: | image_tag: ${{ env.IMAGE_TAG }}
docker run -w /workspace \ pytest_marks: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)"
--name ${{ env.CONTAINER_ID }}_pytest_parallel \ framework: dynamo
${{ steps.define_image_tag.outputs.image_tag }} \ test_type: "pre_merge_parallel"
bash -c "pytest --mypy --basetemp=/tmp/pytest-parallel --junitxml=${{ env.PYTEST_PARALLEL_XML_FILE }} --durations=10 -n 4 -m \"${{ env.PYTEST_MARKS }}\"" platform_arch: amd64
- name: Copy parallel test report from Container enable_mypy: 'true'
if: always() parallel_mode: '4'
run: | dind_as_sidecar: 'false'
docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found"
test-sequential:
needs: [changed-files, build]
if: needs.changed-files.outputs.core == 'true'
runs-on: prod-builder-amd-v1
name: Pytest (sequential)
timeout-minutes: 30
env:
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.image_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image
run: docker pull ${{ env.IMAGE_TAG }}
- name: Run pytest (sequential tests) - name: Run pytest (sequential tests)
env: uses: ./.github/actions/pytest
PYTEST_MARKS: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0 or gpu_1)"
run: |
docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest \
${{ steps.define_image_tag.outputs.image_tag }} \
bash -c "pytest --mypy --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ env.PYTEST_MARKS }}\" "
- name: Copy test report from test Container
if: always()
run: |
docker cp ${{ env.CONTAINER_ID }}_pytest:/workspace/${{ env.PYTEST_XML_FILE }} .
- name: Archive test report
uses: actions/upload-artifact@v4
if: always()
with: with:
name: dynamo-python-test-results image_tag: ${{ env.IMAGE_TAG }}
if-no-files-found: error pytest_marks: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)"
path: | framework: dynamo
${{ env.PYTEST_XML_FILE }} test_type: "pre_merge_sequential"
${{ env.PYTEST_PARALLEL_XML_FILE }} platform_arch: amd64
enable_mypy: 'false'
parallel_mode: 'none'
dind_as_sidecar: 'false'
event_file: event_file:
name: "Event File" name: "Event File"
......
...@@ -27,8 +27,8 @@ dynamo: ...@@ -27,8 +27,8 @@ dynamo:
nixl_gdrcopy_ref: v2.5.1 nixl_gdrcopy_ref: v2.5.1
nixl_ucx_efa_ref: 9d2b88a1f67faf9876f267658bd077b379b8bb76 nixl_ucx_efa_ref: 9d2b88a1f67faf9876f267658bd077b379b8bb76
nixl_libfabric_ref: v2.3.0 nixl_libfabric_ref: v2.3.0
enable_kvbm: "false" enable_kvbm: "true"
enable_media_ffmpeg: "false" enable_media_ffmpeg: "true"
enable_gpu_memory_service: "false" enable_gpu_memory_service: "false"
ffmpeg_version: "7.1" ffmpeg_version: "7.1"
efa_version: 1.45.1 efa_version: 1.45.1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment