"lib/vscode:/vscode.git/clone" did not exist on "1da603a48f6a6a41280efec57d013e87b42d35e2"
Unverified commit 72762dac, authored by Anant Sharma, committed by GitHub
Browse files

ci: migrate container-validation-dynamo to self-hosted runners (#6381)


Signed-off-by: Anant Sharma <anants@nvidia.com>
parent fc229004
......@@ -8,18 +8,26 @@ on:
branches:
- main
- release/*.*.*
pull_request:
- "pull-request/[0-9]+"
# Cancel any previous check runs for the same pull request to avoid redundant workflows.
concurrency:
group: ${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || format('{0}-{1}', github.workflow, github.run_id) }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
# The group name is a ternary operation. If the ref_name is 'main',
# then the group name uses the run_id to ensure a unique group for
# 'main' pushes. Otherwise, the group name is the ref_name, so that
# workflows on the same PR/branch have the same group name for cancelling.
group: dynamo-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
CUDA_VERSION: '12.9'
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
changed-files:
runs-on: ubuntu-latest
outputs:
core: ${{ steps.changes.outputs.core }}
builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
......@@ -30,115 +38,181 @@ jobs:
uses: ./.github/actions/changed-files
with:
gh_token: ${{ github.token }}
- name: Export builder name
id: export-builder-name
run: |
echo "builder_name=${{ env.BUILDER_NAME }}" >> $GITHUB_OUTPUT
dynamo-status-check:
runs-on: ubuntu-latest
needs: [changed-files, build-test]
needs: [changed-files, build, rust-checks, test-parallel, test-sequential]
if: always()
steps:
- name: "Check all dependent jobs"
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
build-test:
build:
needs: changed-files
if: needs.changed-files.outputs.core == 'true'
runs-on:
group: Fastchecker
name: Build and Test - dynamo
env:
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}_dynamo
PYTEST_XML_FILE: pytest_test_report.xml
PYTEST_PARALLEL_XML_FILE: pytest_parallel.xml
runs-on: prod-builder-v3
name: Build
timeout-minutes: 60
outputs:
# Only pass the non-secret tag suffix between jobs (GitHub blanks outputs containing secrets)
image_tag_suffix: ${{ steps.define_image_tag.outputs.image_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
with:
lfs: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to NGC
- name: Initialize Dynamo Builder
uses: ./.github/actions/init-dynamo-builder
with:
builder_name: ${{ needs.changed-files.outputs.builder_name }}
flavor: general
arch: amd64
- name: Docker Login
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
uses: ./.github/actions/docker-login
with:
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Define Image Tag
id: define_image_tag
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
echo "image_tag=dynamo:latest-dev" >> $GITHUB_OUTPUT
CUDA_MAJOR=${CUDA_VERSION%%.*}
IMAGE_TAG_SUFFIX="${{ github.sha }}-dynamo-dev-cuda${CUDA_MAJOR}-amd64"
echo "image_tag_suffix=${IMAGE_TAG_SUFFIX}" >> $GITHUB_OUTPUT
echo "remote_tag=${ECR_HOSTNAME}/ai-dynamo/dynamo:${IMAGE_TAG_SUFFIX}" >> $GITHUB_OUTPUT
- name: Generate Dockerfile
shell: bash
run: |
echo "Generating Dockerfile for target: ${{ inputs.target }} and framework: ${{ inputs.framework }}"
echo "Generating Dockerfile for target: dev and framework: dynamo"
python ./container/render.py \
--target=dev \
--framework=dynamo \
--platform=amd64 \
--cuda-version=${{ env.CUDA_VERSION }} \
--show-result \
--output-short-filename
- name: Build image
- name: Build and Push Container
uses: ./.github/actions/docker-remote-build
with:
image_tag: ${{ steps.define_image_tag.outputs.remote_tag }}
framework: dynamo
target: dev
platform: amd64
cuda_version: ${{ env.CUDA_VERSION }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
push_image: 'true'
rust-checks:
needs: [changed-files, build]
if: needs.changed-files.outputs.core == 'true'
runs-on: prod-builder-amd-v1
name: Rust Checks
timeout-minutes: 30
env:
GITHUB_TOKEN: ${{ secrets.CI_TOKEN }}
run: |
docker buildx build \
--progress=plain \
--tag ${{ steps.define_image_tag.outputs.image_tag }} \
-f ./container/rendered.Dockerfile \
--build-arg ENABLE_MEDIA_FFMPEG=true \
--build-arg ENABLE_KVBM=true \
--load .
- name: Start services with docker-compose
working-directory: ./deploy
run: |
docker compose up -d nats-server etcd-server
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_rust_dynamo
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.image_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image
run: docker pull ${{ env.IMAGE_TAG }}
- name: Run Rust checks (block-manager + media-ffmpeg + integration tests)
run: |
docker run --rm -w /workspace/lib/llm \
--name ${{ env.CONTAINER_ID }}_rust_checks \
${{ steps.define_image_tag.outputs.image_tag }} \
bash -ec 'rustup component add rustfmt clippy && \
-e SCCACHE_BUCKET=${{ secrets.SCCACHE_S3_BUCKET }} \
-e SCCACHE_REGION=${{ secrets.AWS_DEFAULT_REGION }} \
-e AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }} \
-e AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }} \
${{ env.IMAGE_TAG }} \
bash -ec 'ARCH_ALT=x86_64 /workspace/container/use-sccache.sh install && \
eval $(/workspace/container/use-sccache.sh setup-env) && \
rustup component add rustfmt clippy && \
cargo fmt -- --check && \
cargo clippy --features block-manager,media-ffmpeg --no-deps --all-targets -- -D warnings && \
cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl && \
cargo test --locked --features integration -- --nocapture'
- name: Cleanup services
if: always()
working-directory: ./deploy
run: |
docker compose down
cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
/workspace/container/use-sccache.sh show-stats "Rust Checks"'
test-parallel:
needs: [changed-files, build]
if: needs.changed-files.outputs.core == 'true'
runs-on: prod-builder-amd-v1
name: Pytest (parallel)
timeout-minutes: 30
env:
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.image_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image
run: docker pull ${{ env.IMAGE_TAG }}
- name: Run pytest (parallel tests with xdist)
uses: ./.github/actions/pytest
with:
image_tag: ${{ env.IMAGE_TAG }}
pytest_marks: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)"
framework: dynamo
test_type: "pre_merge_parallel"
platform_arch: amd64
enable_mypy: 'true'
parallel_mode: '4'
dind_as_sidecar: 'false'
test-sequential:
needs: [changed-files, build]
if: needs.changed-files.outputs.core == 'true'
runs-on: prod-builder-amd-v1
name: Pytest (sequential)
timeout-minutes: 30
env:
PYTEST_MARKS: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0 or gpu_1)"
run: |
docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest_parallel \
${{ steps.define_image_tag.outputs.image_tag }} \
bash -c "pytest --mypy --basetemp=/tmp/pytest-parallel --junitxml=${{ env.PYTEST_PARALLEL_XML_FILE }} --durations=10 -n 4 -m \"${{ env.PYTEST_MARKS }}\""
- name: Copy parallel test report from Container
if: always()
run: |
docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found"
IMAGE_TAG: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.image_tag_suffix }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image
run: docker pull ${{ env.IMAGE_TAG }}
- name: Run pytest (sequential tests)
env:
PYTEST_MARKS: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0 or gpu_1)"
run: |
docker run -w /workspace \
--name ${{ env.CONTAINER_ID }}_pytest \
${{ steps.define_image_tag.outputs.image_tag }} \
bash -c "pytest --mypy --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ env.PYTEST_MARKS }}\" "
- name: Copy test report from test Container
if: always()
run: |
docker cp ${{ env.CONTAINER_ID }}_pytest:/workspace/${{ env.PYTEST_XML_FILE }} .
- name: Archive test report
uses: actions/upload-artifact@v4
if: always()
uses: ./.github/actions/pytest
with:
name: dynamo-python-test-results
if-no-files-found: error
path: |
${{ env.PYTEST_XML_FILE }}
${{ env.PYTEST_PARALLEL_XML_FILE }}
image_tag: ${{ env.IMAGE_TAG }}
pytest_marks: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)"
framework: dynamo
test_type: "pre_merge_sequential"
platform_arch: amd64
enable_mypy: 'false'
parallel_mode: 'none'
dind_as_sidecar: 'false'
event_file:
name: "Event File"
......
......@@ -27,8 +27,8 @@ dynamo:
nixl_gdrcopy_ref: v2.5.1
nixl_ucx_efa_ref: 9d2b88a1f67faf9876f267658bd077b379b8bb76
nixl_libfabric_ref: v2.3.0
enable_kvbm: "false"
enable_media_ffmpeg: "false"
enable_kvbm: "true"
enable_media_ffmpeg: "true"
enable_gpu_memory_service: "false"
ffmpeg_version: "7.1"
efa_version: 1.45.1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment