Unverified Commit ab5a31b5 authored by Alec, committed by GitHub
Browse files

test(planner): isolate planner-family suites [DYN-2534] (#7723)

parent cc22114d
...@@ -2,8 +2,12 @@ name: 'Build Flavor' ...@@ -2,8 +2,12 @@ name: 'Build Flavor'
description: 'Build a single Dynamo framework image (checkout login tag builder dockerfile build test image summary)' description: 'Build a single Dynamo framework image (checkout login tag builder dockerfile build test image summary)'
inputs: inputs:
framework: framework:
description: 'Framework name (vllm, sglang, trtllm)' description: 'Framework name (e.g. dynamo, vllm, sglang, trtllm)'
required: true required: true
builder_flavor:
description: 'Optional BuildKit routing flavor override (vllm, sglang, trtllm, general)'
required: false
default: ''
target: target:
description: 'Target stage for Docker rendering' description: 'Target stage for Docker rendering'
required: true required: true
...@@ -12,7 +16,12 @@ inputs: ...@@ -12,7 +16,12 @@ inputs:
required: true required: true
cuda_version: cuda_version:
description: 'CUDA version to build (e.g., 12.9, 13.0)' description: 'CUDA version to build (e.g., 12.9, 13.0)'
required: true required: false
default: ''
cpu_only:
description: 'Build and tag this image as CPU-only. The shared container render/build path still requires a default CUDA version internally.'
required: false
default: 'false'
builder_name: builder_name:
description: 'Buildkit builder name' description: 'Buildkit builder name'
required: true required: true
...@@ -93,6 +102,12 @@ outputs: ...@@ -93,6 +102,12 @@ outputs:
cuda_version_plain: cuda_version_plain:
description: 'CUDA major version (e.g., 12 from 12.9)' description: 'CUDA major version (e.g., 12 from 12.9)'
value: ${{ steps.calculate-target-tag.outputs.cuda_version_plain }} value: ${{ steps.calculate-target-tag.outputs.cuda_version_plain }}
image_variant_label:
description: 'Image variant label used in job names and cache namespaces (e.g. cpu, cuda12)'
value: ${{ steps.calculate-target-tag.outputs.image_variant_label }}
image_tag_suffix:
description: 'Image tag suffix including leading hyphen (e.g. -cpu, -cuda12)'
value: ${{ steps.calculate-target-tag.outputs.image_tag_suffix }}
runs: runs:
using: "composite" using: "composite"
...@@ -109,44 +124,68 @@ runs: ...@@ -109,44 +124,68 @@ runs:
id: calculate-target-tag id: calculate-target-tag
shell: bash shell: bash
run: | run: |
CUDA_VERSION_RAW=${{ inputs.cuda_version }} if [[ "${{ inputs.cpu_only }}" == "true" ]]; then
CUDA_VERSION=${CUDA_VERSION_RAW%%.*} CUDA_VERSION_RAW="${{ inputs.cuda_version }}"
if [[ -z "$CUDA_VERSION_RAW" ]]; then
# Planner still shares the CUDA-backed wheel builder path today even
# though the final runtime image is CPU-only. Keep that detail
# internal to the build action instead of surfacing it in job names.
CUDA_VERSION_RAW="12.9"
fi
CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
IMAGE_VARIANT_LABEL="cpu"
IMAGE_TAG_SUFFIX="-cpu"
BUILDER_CUDA_VERSION=""
else
CUDA_VERSION_RAW="${{ inputs.cuda_version }}"
if [[ -z "$CUDA_VERSION_RAW" ]]; then
echo "cuda_version is required unless cpu_only=true" >&2
exit 1
fi
CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
IMAGE_VARIANT_LABEL="cuda${CUDA_VERSION}"
IMAGE_TAG_SUFFIX="-cuda${CUDA_VERSION}"
BUILDER_CUDA_VERSION="${CUDA_VERSION_RAW}"
fi
EFA_SUFFIX="" EFA_SUFFIX=""
if [ "${{ inputs.make_efa }}" == "true" ]; then if [ "${{ inputs.make_efa }}" == "true" ]; then
EFA_SUFFIX="-efa" EFA_SUFFIX="-efa"
fi fi
TARGET_TAG_PLAIN="${{ github.sha }}-${{ inputs.framework }}-${{ inputs.target }}${EFA_SUFFIX}" TARGET_TAG_PLAIN="${{ github.sha }}-${{ inputs.framework }}-${{ inputs.target }}${EFA_SUFFIX}"
TEST_TAG_PLAIN="${{ github.sha }}-${{ inputs.framework }}-${{ inputs.target }}${EFA_SUFFIX}-test" TEST_TAG_PLAIN="${{ github.sha }}-${{ inputs.framework }}-${{ inputs.target }}${EFA_SUFFIX}-test"
DEFAULT_TARGET_IMAGE_URI="${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}" DEFAULT_TARGET_IMAGE_URI="${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}${IMAGE_TAG_SUFFIX}"
TEST_IMAGE_URI="${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com/ai-dynamo/dynamo:${TEST_TAG_PLAIN}-cuda${CUDA_VERSION}" TEST_IMAGE_URI="${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com/ai-dynamo/dynamo:${TEST_TAG_PLAIN}${IMAGE_TAG_SUFFIX}"
echo "default_target_image_uri=${DEFAULT_TARGET_IMAGE_URI}" >> $GITHUB_OUTPUT echo "default_target_image_uri=${DEFAULT_TARGET_IMAGE_URI}" >> $GITHUB_OUTPUT
echo "test_image_uri=${TEST_IMAGE_URI}" >> $GITHUB_OUTPUT echo "test_image_uri=${TEST_IMAGE_URI}" >> $GITHUB_OUTPUT
echo "target_tag_plain=${TARGET_TAG_PLAIN}" >> $GITHUB_OUTPUT echo "target_tag_plain=${TARGET_TAG_PLAIN}" >> $GITHUB_OUTPUT
echo "test_tag_plain=${TEST_TAG_PLAIN}" >> $GITHUB_OUTPUT echo "test_tag_plain=${TEST_TAG_PLAIN}" >> $GITHUB_OUTPUT
echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
echo "effective_cuda_version=${CUDA_VERSION_RAW}" >> $GITHUB_OUTPUT
echo "builder_cuda_version=${BUILDER_CUDA_VERSION}" >> $GITHUB_OUTPUT
echo "image_variant_label=${IMAGE_VARIANT_LABEL}" >> $GITHUB_OUTPUT
echo "image_tag_suffix=${IMAGE_TAG_SUFFIX}" >> $GITHUB_OUTPUT
- name: Initialize Dynamo Builder - name: Initialize Dynamo Builder
uses: ./.github/actions/init-dynamo-builder uses: ./.github/actions/init-dynamo-builder
with: with:
builder_name: ${{ inputs.builder_name }} builder_name: ${{ inputs.builder_name }}
flavor: ${{ inputs.framework }} flavor: ${{ inputs.builder_flavor != '' && inputs.builder_flavor || inputs.framework }}
arch: ${{ inputs.platform }} arch: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }} cuda_version: ${{ steps.calculate-target-tag.outputs.builder_cuda_version }}
fresh_builder: ${{ inputs.fresh_builder }} fresh_builder: ${{ inputs.fresh_builder }}
- name: Calculate extra tags - name: Calculate extra tags
id: extra-tags id: extra-tags
shell: bash shell: bash
env: env:
EXTRA_TAGS: ${{ inputs.extra_tags }} EXTRA_TAGS: ${{ inputs.extra_tags }}
CUDA_VERSION: ${{ inputs.cuda_version }} IMAGE_TAG_SUFFIX: ${{ steps.calculate-target-tag.outputs.image_tag_suffix }}
run: | run: |
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
ECR_REGISTRY="${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com" ECR_REGISTRY="${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com"
ACR_REGISTRY="${{ inputs.azure_acr_hostname }}" ACR_REGISTRY="${{ inputs.azure_acr_hostname }}"
RESULT="" RESULT=""
if [ -n "$EXTRA_TAGS" ]; then if [ -n "$EXTRA_TAGS" ]; then
while IFS= read -r tag; do while IFS= read -r tag; do
if [ -n "$tag" ]; then if [ -n "$tag" ]; then
RESULT+="${ECR_REGISTRY}/ai-dynamo/dynamo:${tag}-cuda${CUDA_VERSION_MAJOR}"$'\n' RESULT+="${ECR_REGISTRY}/ai-dynamo/dynamo:${tag}${IMAGE_TAG_SUFFIX}"$'\n'
fi fi
done <<< "$EXTRA_TAGS" done <<< "$EXTRA_TAGS"
fi fi
...@@ -165,7 +204,8 @@ runs: ...@@ -165,7 +204,8 @@ runs:
echo "framework: ${{ inputs.framework }}" echo "framework: ${{ inputs.framework }}"
echo "target: ${{ inputs.target }}" echo "target: ${{ inputs.target }}"
echo "platform: ${{ inputs.platform }}" echo "platform: ${{ inputs.platform }}"
echo "cuda_version: ${{ inputs.cuda_version }}" echo "image_variant: ${{ steps.calculate-target-tag.outputs.image_variant_label }}"
echo "effective_build_cuda_version: ${{ steps.calculate-target-tag.outputs.effective_cuda_version }}"
echo "no_cache: ${{ inputs.no_cache }}" echo "no_cache: ${{ inputs.no_cache }}"
echo "extra_tags: ${{ steps.extra-tags.outputs.tags }}" echo "extra_tags: ${{ steps.extra-tags.outputs.tags }}"
echo "push_image: ${{ inputs.push_image }}" echo "push_image: ${{ inputs.push_image }}"
...@@ -183,7 +223,7 @@ runs: ...@@ -183,7 +223,7 @@ runs:
--target=${{ inputs.target }} \ --target=${{ inputs.target }} \
--framework=${{ inputs.framework }} \ --framework=${{ inputs.framework }} \
--platform=${{ inputs.platform }} \ --platform=${{ inputs.platform }} \
--cuda-version=${{ inputs.cuda_version }} \ --cuda-version=${{ steps.calculate-target-tag.outputs.effective_cuda_version }} \
${MAKE_EFA_FLAG} \ ${MAKE_EFA_FLAG} \
--show-result \ --show-result \
--output-short-filename --output-short-filename
...@@ -195,7 +235,7 @@ runs: ...@@ -195,7 +235,7 @@ runs:
framework: ${{ inputs.framework }} framework: ${{ inputs.framework }}
target: ${{ inputs.target }} target: ${{ inputs.target }}
platform: ${{ inputs.platform }} platform: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }} cuda_version: ${{ steps.calculate-target-tag.outputs.effective_cuda_version }}
aws_default_region: ${{ inputs.aws_default_region }} aws_default_region: ${{ inputs.aws_default_region }}
sccache_s3_bucket: ${{ inputs.sccache_s3_bucket }} sccache_s3_bucket: ${{ inputs.sccache_s3_bucket }}
aws_account_id: ${{ inputs.aws_account_id }} aws_account_id: ${{ inputs.aws_account_id }}
...@@ -213,17 +253,17 @@ runs: ...@@ -213,17 +253,17 @@ runs:
uses: ./.github/actions/builder-refresher uses: ./.github/actions/builder-refresher
with: with:
builder_name: ${{ inputs.builder_name }} builder_name: ${{ inputs.builder_name }}
flavor: ${{ inputs.framework }} flavor: ${{ inputs.builder_flavor != '' && inputs.builder_flavor || inputs.framework }}
arch: ${{ inputs.platform }} arch: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }} cuda_version: ${{ steps.calculate-target-tag.outputs.builder_cuda_version }}
- name: Build and Push Test Image - name: Build and Push Test Image
if: ${{ inputs.target != 'dev' }} if: ${{ inputs.target != 'dev' }}
shell: bash shell: bash
env: env:
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
run: | run: |
CUDA_MAJOR=${{ steps.calculate-target-tag.outputs.cuda_version_plain }} IMAGE_VARIANT="${{ steps.calculate-target-tag.outputs.image_variant_label }}"
CACHE_TAG="test-${{ inputs.framework }}-cuda${CUDA_MAJOR}-cache" CACHE_TAG="test-${{ inputs.framework }}-${IMAGE_VARIANT}-cache"
CACHE_ARGS="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG}" CACHE_ARGS="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG}"
CACHE_ARGS+=" --cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG}" CACHE_ARGS+=" --cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG}"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then if [[ "$GITHUB_REF_NAME" == "main" ]]; then
...@@ -257,7 +297,7 @@ runs: ...@@ -257,7 +297,7 @@ runs:
shell: bash shell: bash
if: ${{ inputs.push_image == 'true' && inputs.show_summary == 'true' }} if: ${{ inputs.push_image == 'true' && inputs.show_summary == 'true' }}
run: | run: |
echo "### 🐳 ${{ inputs.framework }}-cuda${{ inputs.cuda_version }} Default Image" >> $GITHUB_STEP_SUMMARY echo "### 🐳 ${{ inputs.framework }}-${{ steps.calculate-target-tag.outputs.image_variant_label }} Default Image" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "| Image URI |" >> $GITHUB_STEP_SUMMARY echo "| Image URI |" >> $GITHUB_STEP_SUMMARY
echo "|-----|" >> $GITHUB_STEP_SUMMARY echo "|-----|" >> $GITHUB_STEP_SUMMARY
......
...@@ -151,7 +151,7 @@ runs: ...@@ -151,7 +151,7 @@ runs:
--name ${{ env.CONTAINER_ID }}_pytest \ --name ${{ env.CONTAINER_ID }}_pytest \
-v "${TEST_RESULTS_DIR}:/workspace/test-results" \ -v "${TEST_RESULTS_DIR}:/workspace/test-results" \
${{ inputs.image_tag }} \ ${{ inputs.image_tag }} \
bash -c "${PYTEST_CMD}" sh -c "umask 0022 && ${PYTEST_CMD}"
TEST_EXIT_CODE=$? TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
...@@ -244,7 +244,7 @@ runs: ...@@ -244,7 +244,7 @@ runs:
--name ${{ env.CONTAINER_ID }}_pytest \ --name ${{ env.CONTAINER_ID }}_pytest \
-v "${TEST_RESULTS_DIR}:/workspace/test-results" \ -v "${TEST_RESULTS_DIR}:/workspace/test-results" \
${{ inputs.image_tag }} \ ${{ inputs.image_tag }} \
bash -c "${PYTEST_CMD}" sh -c "umask 0022 && ${PYTEST_CMD}"
TEST_EXIT_CODE=$? TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
......
...@@ -119,8 +119,6 @@ planner: ...@@ -119,8 +119,6 @@ planner:
- 'container/templates/planner.Dockerfile' - 'container/templates/planner.Dockerfile'
- 'components/src/dynamo/planner/**' - 'components/src/dynamo/planner/**'
- 'components/src/dynamo/global_planner/**' - 'components/src/dynamo/global_planner/**'
- 'tests/planner/**'
- 'tests/global_planner/**'
- 'components/src/dynamo/profiler/**' - 'components/src/dynamo/profiler/**'
- 'components/src/dynamo/global_router/**' - 'components/src/dynamo/global_router/**'
......
...@@ -64,7 +64,8 @@ planner: ...@@ -64,7 +64,8 @@ planner:
- changed-files: - changed-files:
- any-glob-to-any-file: - any-glob-to-any-file:
- components/src/dynamo/planner/** - components/src/dynamo/planner/**
- tests/planner/** - components/src/dynamo/profiler/**
- components/src/dynamo/global_planner/**
# Deployment labels # Deployment labels
deployment::k8s: deployment::k8s:
......
...@@ -7,9 +7,14 @@ on: ...@@ -7,9 +7,14 @@ on:
workflow_call: workflow_call:
inputs: inputs:
framework: framework:
description: 'Framework name (vllm, sglang, trtllm)' description: 'Framework name (e.g. dynamo, vllm, sglang, trtllm)'
required: true required: true
type: string type: string
builder_flavor:
description: 'Optional BuildKit routing flavor override (vllm, sglang, trtllm, general)'
required: false
type: string
default: ''
target: target:
description: 'Target stage for Docker rendering' description: 'Target stage for Docker rendering'
required: true required: true
...@@ -20,8 +25,14 @@ on: ...@@ -20,8 +25,14 @@ on:
type: string type: string
cuda_version: cuda_version:
description: 'CUDA version to build (e.g., 12.9, 13.0)' description: 'CUDA version to build (e.g., 12.9, 13.0)'
required: true required: false
type: string type: string
default: ''
cpu_only:
description: 'Build and test this target as a CPU-only image variant'
required: false
type: boolean
default: false
build_timeout_minutes: build_timeout_minutes:
description: 'Timeout in minutes for the build step' description: 'Timeout in minutes for the build step'
required: false required: false
...@@ -148,12 +159,14 @@ jobs: ...@@ -148,12 +159,14 @@ jobs:
# ============================================================================ # ============================================================================
build: build:
if: inputs.build_image if: inputs.build_image
name: Build cuda${{ inputs.cuda_version }} name: Build ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}
runs-on: prod-builder-v3 runs-on: prod-builder-v3
timeout-minutes: ${{ inputs.build_timeout_minutes }} timeout-minutes: ${{ inputs.build_timeout_minutes }}
outputs: outputs:
target_tag_plain: ${{ steps.build.outputs.target_tag_plain }} target_tag_plain: ${{ steps.build.outputs.target_tag_plain }}
test_tag_plain: ${{ steps.build.outputs.test_tag_plain }} test_tag_plain: ${{ steps.build.outputs.test_tag_plain }}
image_variant_label: ${{ steps.build.outputs.image_variant_label }}
image_tag_suffix: ${{ steps.build.outputs.image_tag_suffix }}
compliance_arches: ${{ steps.compliance-arches.outputs.arches }} compliance_arches: ${{ steps.compliance-arches.outputs.arches }}
test_runners: ${{ steps.test-runners.outputs.runners }} test_runners: ${{ steps.test-runners.outputs.runners }}
env: env:
...@@ -194,9 +207,11 @@ jobs: ...@@ -194,9 +207,11 @@ jobs:
uses: ./.github/actions/build-flavor uses: ./.github/actions/build-flavor
with: with:
framework: ${{ inputs.framework }} framework: ${{ inputs.framework }}
builder_flavor: ${{ inputs.builder_flavor }}
target: ${{ inputs.target }} target: ${{ inputs.target }}
platform: ${{ inputs.platform }} platform: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }} cuda_version: ${{ inputs.cuda_version }}
cpu_only: ${{ inputs.cpu_only }}
builder_name: ${{ inputs.builder_name }} builder_name: ${{ inputs.builder_name }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
...@@ -225,7 +240,7 @@ jobs: ...@@ -225,7 +240,7 @@ jobs:
( inputs.run_cpu_only_tests || inputs.run_single_gpu_tests ) && ( inputs.run_cpu_only_tests || inputs.run_single_gpu_tests ) &&
inputs.build_image inputs.build_image
needs: [build] needs: [build]
name: Test cuda${{ inputs.cuda_version }} (${{ matrix.arch }}) name: Test ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }} (${{ matrix.arch }})
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
...@@ -240,12 +255,9 @@ jobs: ...@@ -240,12 +255,9 @@ jobs:
id: calculate-target-tag id: calculate-target-tag
shell: bash shell: bash
run: | run: |
CUDA_VERSION_RAW=${{ inputs.cuda_version }} RUNTIME_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
RUNTIME_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${CUDA_VERSION}
echo "runtime_image=${RUNTIME_IMAGE}" >> $GITHUB_OUTPUT echo "runtime_image=${RUNTIME_IMAGE}" >> $GITHUB_OUTPUT
TEST_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_plain }}-cuda${CUDA_VERSION} TEST_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT
- name: Docker Login - name: Docker Login
uses: ./.github/actions/docker-login uses: ./.github/actions/docker-login
...@@ -275,9 +287,13 @@ jobs: ...@@ -275,9 +287,13 @@ jobs:
# Run the sanity check script inside the container # Run the sanity check script inside the container
# The script is located in /workspace/deploy/sanity_check.py in runtime containers # The script is located in /workspace/deploy/sanity_check.py in runtime containers
export WORKSPACE=/workspace export WORKSPACE=/workspace
SANITY_FLAGS="--runtime-check --no-gpu-check"
if [[ "${{ inputs.target }}" == "planner" ]]; then
SANITY_FLAGS="${SANITY_FLAGS} --no-framework-check"
fi
set +e set +e
docker run --rm "${{ steps.calculate-target-tag.outputs.runtime_image }}" python ${WORKSPACE}/deploy/sanity_check.py --runtime-check --no-gpu-check docker run --rm "${{ steps.calculate-target-tag.outputs.runtime_image }}" python ${WORKSPACE}/deploy/sanity_check.py ${SANITY_FLAGS}
SANITY_CHECK_EXIT_CODE=$? SANITY_CHECK_EXIT_CODE=$?
set -e set -e
if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then
...@@ -328,7 +344,7 @@ jobs: ...@@ -328,7 +344,7 @@ jobs:
inputs.run_multi_gpu_tests && inputs.run_multi_gpu_tests &&
inputs.build_image inputs.build_image
needs: [build] needs: [build]
name: Multi-gpu test cuda${{ inputs.cuda_version }} name: Multi-gpu test ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}
runs-on: prod-tester-amd-gpu-4-v1 runs-on: prod-tester-amd-gpu-4-v1
env: env:
FRAMEWORK: ${{ inputs.framework }} FRAMEWORK: ${{ inputs.framework }}
...@@ -339,12 +355,9 @@ jobs: ...@@ -339,12 +355,9 @@ jobs:
id: calculate-target-tag id: calculate-target-tag
shell: bash shell: bash
run: | run: |
CUDA_VERSION_RAW=${{ inputs.cuda_version }} RUNTIME_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
RUNTIME_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${CUDA_VERSION}
echo "runtime_image=${RUNTIME_IMAGE}" >> $GITHUB_OUTPUT echo "runtime_image=${RUNTIME_IMAGE}" >> $GITHUB_OUTPUT
TEST_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_plain }}-cuda${CUDA_VERSION} TEST_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.test_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT
- name: Docker Login - name: Docker Login
uses: ./.github/actions/docker-login uses: ./.github/actions/docker-login
...@@ -390,7 +403,7 @@ jobs: ...@@ -390,7 +403,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
arch: ${{ fromJson(needs.build.outputs.compliance_arches) }} arch: ${{ fromJson(needs.build.outputs.compliance_arches) }}
name: Compliance cuda${{ inputs.cuda_version }}-${{ matrix.arch }} name: Compliance ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}-${{ matrix.arch }}
runs-on: prod-builder-v3 runs-on: prod-builder-v3
steps: steps:
- name: Checkout repository - name: Checkout repository
...@@ -407,10 +420,8 @@ jobs: ...@@ -407,10 +420,8 @@ jobs:
id: images id: images
shell: bash shell: bash
run: | run: |
CUDA_VERSION_RAW=${{ inputs.cuda_version }} echo "image_variant_label=${{ needs.build.outputs.image_variant_label }}" >> $GITHUB_OUTPUT
CUDA_VERSION=${CUDA_VERSION_RAW%%.*} RUNTIME_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
echo "cuda_major=${CUDA_VERSION}" >> $GITHUB_OUTPUT
RUNTIME_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${CUDA_VERSION}
echo "runtime_image=${RUNTIME_IMAGE}" >> $GITHUB_OUTPUT echo "runtime_image=${RUNTIME_IMAGE}" >> $GITHUB_OUTPUT
# Sanitize arch for artifact name: linux/amd64 -> amd64 (artifact names can't contain /) # Sanitize arch for artifact name: linux/amd64 -> amd64 (artifact names can't contain /)
ARCH="${{ matrix.arch }}" ARCH="${{ matrix.arch }}"
...@@ -419,9 +430,10 @@ jobs: ...@@ -419,9 +430,10 @@ jobs:
uses: ./.github/actions/compliance-scan uses: ./.github/actions/compliance-scan
with: with:
image: ${{ steps.images.outputs.runtime_image }} image: ${{ steps.images.outputs.runtime_image }}
artifact_name: compliance-${{ inputs.framework }}-${{ inputs.target }}${{ inputs.make_efa && '-efa' || '' }}-cuda${{ steps.images.outputs.cuda_major }}-${{ steps.images.outputs.arch_suffix }} artifact_name: compliance-${{ inputs.framework }}-${{ inputs.target }}${{ inputs.make_efa && '-efa' || '' }}-${{ steps.images.outputs.image_variant_label }}-${{ steps.images.outputs.arch_suffix }}
arch: ${{ matrix.arch }} arch: ${{ matrix.arch }}
framework: ${{ inputs.framework }} framework: ${{ inputs.framework }}
target: ${{ inputs.target }}
cuda_version: ${{ inputs.cuda_version }} cuda_version: ${{ inputs.cuda_version }}
...@@ -436,7 +448,7 @@ jobs: ...@@ -436,7 +448,7 @@ jobs:
inputs.copy_to_acr && inputs.copy_to_acr &&
needs.build.result == 'success' && needs.build.result == 'success' &&
(needs.test.result == 'success' || needs.test.result == 'skipped') (needs.test.result == 'success' || needs.test.result == 'skipped')
name: copy-to-acr cuda${{ inputs.cuda_version }} name: copy-to-acr ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}
runs-on: prod-default-small-v2 runs-on: prod-default-small-v2
outputs: outputs:
target_tag_plain: ${{ needs.build.outputs.target_tag_plain }} target_tag_plain: ${{ needs.build.outputs.target_tag_plain }}
...@@ -444,24 +456,16 @@ jobs: ...@@ -444,24 +456,16 @@ jobs:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Calculate target tag
id: calculate-target-tag
shell: bash
run: |
CUDA_VERSION_RAW=${{ inputs.cuda_version }}
CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
- name: Copy image to target registry - name: Copy image to target registry
timeout-minutes: ${{ inputs.copy_timeout_minutes }} timeout-minutes: ${{ inputs.copy_timeout_minutes }}
uses: ./.github/actions/skopeo-copy uses: ./.github/actions/skopeo-copy
with: with:
source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
source_image: ai-dynamo/dynamo source_image: ai-dynamo/dynamo
source_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }} source_tag: ${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }} target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
target_image: ai-dynamo/dynamo target_image: ai-dynamo/dynamo
target_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }} target_tag: ${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
......
...@@ -36,6 +36,7 @@ jobs: ...@@ -36,6 +36,7 @@ jobs:
environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }} environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }}
outputs: outputs:
core: ${{ steps.changes.outputs.core }} core: ${{ steps.changes.outputs.core }}
planner: ${{ steps.changes.outputs.planner }}
operator: ${{ steps.changes.outputs.operator }} operator: ${{ steps.changes.outputs.operator }}
deploy: ${{ steps.changes.outputs.deploy }} deploy: ${{ steps.changes.outputs.deploy }}
vllm: ${{ steps.changes.outputs.vllm }} vllm: ${{ steps.changes.outputs.vllm }}
...@@ -59,7 +60,7 @@ jobs: ...@@ -59,7 +60,7 @@ jobs:
backend-status-check: backend-status-check:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [changed-files, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator] # THIS list determines blocking jobs needs: [changed-files, planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator] # THIS list determines blocking jobs
if: always() if: always()
steps: steps:
- name: "Check all dependent jobs" - name: "Check all dependent jobs"
...@@ -176,6 +177,30 @@ jobs: ...@@ -176,6 +177,30 @@ jobs:
# ============================================================================ # ============================================================================
# FRAMEWORK PIPELINES (Build → Test → Copy) # FRAMEWORK PIPELINES (Build → Test → Copy)
# ============================================================================ # ============================================================================
# ============================================================================
# PLANNER PIPELINE
# ============================================================================
planner-pipeline:
name: planner
needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.planner == 'true'
uses: ./.github/workflows/build-test-distribute-flavor.yml
with:
framework: dynamo
builder_flavor: general
target: planner
platform: 'linux/amd64'
cpu_only: true
builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 45
run_cpu_only_tests: true
cpu_only_test_markers: 'pre_merge and planner and gpu_0'
cpu_only_test_timeout_minutes: 30
run_single_gpu_tests: false
run_multi_gpu_tests: false
copy_to_acr: false
secrets: inherit
# ============================================================================ # ============================================================================
# VLLM PIPELINE # VLLM PIPELINE
# ============================================================================ # ============================================================================
...@@ -406,7 +431,7 @@ jobs: ...@@ -406,7 +431,7 @@ jobs:
name: Clean K8s builder if exists name: Clean K8s builder if exists
runs-on: prod-default-small-v2 runs-on: prod-default-small-v2
if: always() if: always()
needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator, changed-files] needs: [planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator, changed-files]
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
......
...@@ -33,7 +33,6 @@ CODEOWNERS @ai-dynamo/Devops ...@@ -33,7 +33,6 @@ CODEOWNERS @ai-dynamo/Devops
/components/src/dynamo/global_planner/ @ai-dynamo/python-codeowners @ai-dynamo/Devops /components/src/dynamo/global_planner/ @ai-dynamo/python-codeowners @ai-dynamo/Devops
/examples/global_planner/ @ai-dynamo/python-codeowners @ai-dynamo/Devops /examples/global_planner/ @ai-dynamo/python-codeowners @ai-dynamo/Devops
/components/src/dynamo/profiler/ @ai-dynamo/python-codeowners @ai-dynamo/Devops /components/src/dynamo/profiler/ @ai-dynamo/python-codeowners @ai-dynamo/Devops
/tests/planner/ @ai-dynamo/python-codeowners @ai-dynamo/Devops
# recipes # recipes
......
...@@ -44,3 +44,4 @@ When both modes are enabled, throughput-based scaling provides a lower bound on ...@@ -44,3 +44,4 @@ When both modes are enabled, throughput-based scaling provides a lower bound on
- **User docs**: [Planner Guide](../../../../docs/components/planner/planner-guide.md) (deployment, configuration, examples) - **User docs**: [Planner Guide](../../../../docs/components/planner/planner-guide.md) (deployment, configuration, examples)
- **Design docs**: [Planner Design](../../../../docs/design-docs/planner-design.md) (architecture, algorithms) - **Design docs**: [Planner Design](../../../../docs/design-docs/planner-design.md) (architecture, algorithms)
- **Manual workflows**: [tests/manual/README.md](tests/manual/README.md) (dry run helpers, perf configs, and manual scaling scripts)
...@@ -323,7 +323,7 @@ class PreSweptResultsHelper: ...@@ -323,7 +323,7 @@ class PreSweptResultsHelper:
if __name__ == "__main__": if __name__ == "__main__":
# demo of how to use merge_raw_data # demo of how to use merge_raw_data
merge_raw_data( merge_raw_data(
"/home/jasonzho/repo/dynamo/tests/planner/profiling_results/H200_TP1P_TP1D/selected_prefill_interpolation/raw_data.npz", "/home/jasonzho/repo/dynamo/components/src/dynamo/planner/tests/data/profiling_results/H200_TP1P_TP1D/selected_prefill_interpolation/raw_data.npz",
configs={ configs={
"gpu_type": "h200_sxm", "gpu_type": "h200_sxm",
"model": "nvidia/Llama-3.1-8B-Instruct-FP8", "model": "nvidia/Llama-3.1-8B-Instruct-FP8",
...@@ -339,7 +339,7 @@ if __name__ == "__main__": ...@@ -339,7 +339,7 @@ if __name__ == "__main__":
mode="prefill", mode="prefill",
) )
merge_raw_data( merge_raw_data(
"/home/jasonzho/repo/dynamo/tests/planner/profiling_results/H200_TP1P_TP1D/selected_decode_interpolation/raw_data.npz", "/home/jasonzho/repo/dynamo/components/src/dynamo/planner/tests/data/profiling_results/H200_TP1P_TP1D/selected_decode_interpolation/raw_data.npz",
configs={ configs={
"gpu_type": "h200_sxm", "gpu_type": "h200_sxm",
"model": "nvidia/Llama-3.1-8B-Instruct-FP8", "model": "nvidia/Llama-3.1-8B-Instruct-FP8",
......
...@@ -16,9 +16,9 @@ from dynamo.planner import SubComponentType, TargetReplica, VirtualConnector ...@@ -16,9 +16,9 @@ from dynamo.planner import SubComponentType, TargetReplica, VirtualConnector
pytestmark = [ pytestmark = [
pytest.mark.gpu_0, pytest.mark.gpu_0,
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.unit, pytest.mark.integration,
pytest.mark.planner,
pytest.mark.sglang, pytest.mark.sglang,
pytest.mark.planner,
] ]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
......
...@@ -8,4 +8,4 @@ e2e_scaling_results/ ...@@ -8,4 +8,4 @@ e2e_scaling_results/
# Python cache # Python cache
__pycache__/ __pycache__/
*.pyc *.pyc
*.pyo *.pyo
\ No newline at end of file
...@@ -10,7 +10,7 @@ This directory contains comprehensive testing tools for validating the SLA plann ...@@ -10,7 +10,7 @@ This directory contains comprehensive testing tools for validating the SLA plann
The SLA planner monitors metrics every 60 seconds (default adjustment interval) and scales The SLA planner monitors metrics every 60 seconds (default adjustment interval) and scales
prefill/decode workers based on TTFT, ITL, and request patterns. prefill/decode workers based on TTFT, ITL, and request patterns.
To set up the environment, simply use the released docker images for any backends, or build your own docker image following the READMEs in `./examples/backends/<vllm/sglang/trtllm>/README.md`, or follow the `Developing Locally` section in [README.md](../../README.md) to set up the environment locally. If using the local environment, make sure to install dependencies by running `UV_GIT_LFS=1 uv pip install --no-cache -r container/deps/requirements.common.txt -r container/deps/requirements.planner.txt` To set up the environment, simply use the released docker images for any backends, or build your own docker image following the READMEs in `./examples/backends/<vllm/sglang/trtllm>/README.md`, or follow the `Developing Locally` section in [README.md](../../../../../../README.md) to set up the environment locally. If using the local environment, make sure to install dependencies by running `UV_GIT_LFS=1 uv pip install --no-cache -r container/deps/requirements.common.txt -r container/deps/requirements.planner.txt`
## Pre-Requisite: Pre-Deployment Profiling Data ## Pre-Requisite: Pre-Deployment Profiling Data
...@@ -19,18 +19,18 @@ You have two options to obtain the pre-deployment profiling data: ...@@ -19,18 +19,18 @@ You have two options to obtain the pre-deployment profiling data:
### Option A: Use Test Configuration (Quickstart) ### Option A: Use Test Configuration (Quickstart)
Use the pre-configured test deployment with sample profiling data. We provide the results and the deployment configuration for the following models x hardware configurations: Use the pre-configured test deployment with sample profiling data. We provide the results and the deployment configuration for the following models x hardware configurations:
- `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 with max context length 16384, TP1 Prefill, and TP1 Decode. At ISL/OSL 3000/150, it achieves 40k tokens/s/gpu prefill with 80ms TTFT and 10k tokens/s/gpu decode with 10ms ITL. See `profiling_results/H200_TP1P_TP1D/`. - `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 with max context length 16384, TP1 Prefill, and TP1 Decode. At ISL/OSL 3000/150, it achieves 40k tokens/s/gpu prefill with 80ms TTFT and 10k tokens/s/gpu decode with 10ms ITL. See `../tests/data/profiling_results/H200_TP1P_TP1D/`.
### Option B: Use Your Own Profiling Results ### Option B: Use Your Own Profiling Results
1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/components/profiler/profiler-guide.md) for detailed instructions. 1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../../../../../docs/components/profiler/profiler-guide.md) for detailed instructions.
## Interpolator Testing ## Interpolator Testing
SLA planner uses two interpolators to estimate the performance of prefill and decode. You can test the interpolators with the following command: SLA planner uses two interpolators to estimate the performance of prefill and decode. You can test the interpolators with the following command:
```bash ```bash
python components/planner/src/dynamo/planner/utils/perf_interpolation.py \ python components/src/dynamo/planner/core/throughput/interpolation.py \
--profile_results_dir <path_to_profile_results> \ --profile_results_dir <path_to_profile_results> \
--isl <ISL> \ --isl <ISL> \
--osl <OSL> \ --osl <OSL> \
...@@ -43,8 +43,8 @@ The script will perform the interpolation based on ISL, OSL, and TTFT and ITL SL ...@@ -43,8 +43,8 @@ The script will perform the interpolation based on ISL, OSL, and TTFT and ITL SL
For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms): For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms):
```bash ```bash
python components/planner/src/dynamo/planner/utils/perf_interpolation.py \ python components/src/dynamo/planner/core/throughput/interpolation.py \
--profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \ --profile_results_dir components/src/dynamo/planner/tests/data/profiling_results/H200_TP1P_TP1D/ \
--isl 3000 \ --isl 3000 \
--osl 300 \ --osl 300 \
--ttft 200 \ --ttft 200 \
...@@ -53,7 +53,7 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \ ...@@ -53,7 +53,7 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
# output: # output:
ISL=3000, OSL=300 ISL=3000, OSL=300
TTFT=200ms, ITL=10ms TTFT=200ms, ITL=10ms
Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/ Using profile results from components/src/dynamo/planner/tests/data/profiling_results/H200_TP1P_TP1D/
Interpolating prefill performance ... Interpolating prefill performance ...
Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA. Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA.
...@@ -67,7 +67,7 @@ Interpolating decode performance ... ...@@ -67,7 +67,7 @@ Interpolating decode performance ...
## Generating Load Dataset ## Generating Load Dataset
We provide a tool to generate a load dataset with a varying request rate. More details can be found in [sin_load_generator](../../benchmarks/sin_load_generator/README.md). We provide a tool to generate a load dataset with a varying request rate. More details can be found in [sin_load_generator](../../../../../../benchmarks/sin_load_generator/README.md).
From previous interpolator testing, ISL 3000 and OSL 300 can handle ~15 request/s/gpu for both prefill and decode. From previous interpolator testing, ISL 3000 and OSL 300 can handle ~15 request/s/gpu for both prefill and decode.
To test the planner's performance for different request rates, we can generate a load dataset with request rate varying between 12 and 36 requests/s. To test the planner's performance for different request rates, we can generate a load dataset with request rate varying between 12 and 36 requests/s.
...@@ -99,22 +99,8 @@ Before testing SLA planner on real deployments, we provide a dry run feature to ...@@ -99,22 +99,8 @@ Before testing SLA planner on real deployments, we provide a dry run feature to
To dry run SLA planner, To dry run SLA planner,
```bash ```bash
python components/planner/test/planner_sla_dryrun.py \ python components/src/dynamo/planner/tests/manual/unit/planner_sla_dryrun.py \
--<SLA planner arguments> \ --config '{"environment":"kubernetes","backend":"vllm","ttft":200,"itl":10,"profile_results_dir":"components/src/dynamo/planner/tests/data/profiling_results/H200_TP1P_TP1D","throughput_adjustment_interval":60,"no_correction":true}' \
--dry-run \
--start-num-p <num_prefill_workers_to_start_with> \
--start-num-d <num_decode_workers_to_start_with> \
--output-plot <path_to_output_plot>
```
For example, to dry run SLA planner for the previous FP8 8B on H200 using the generated `rr-5-45_i3000o300.jsonl` dataset,
```bash
python components/planner/test/planner_sla_dryrun.py \
--ttft 200 \
--itl 10 \
--adjustment-interval 60 \
--profile-results-dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
--dataset rr-5-45_i3000o300.jsonl \ --dataset rr-5-45_i3000o300.jsonl \
--start-num-p 1 \ --start-num-p 1 \
--start-num-d 1 \ --start-num-d 1 \
...@@ -139,7 +125,7 @@ This directory contains comprehensive tests for validating the SLA planner's sca ...@@ -139,7 +125,7 @@ This directory contains comprehensive tests for validating the SLA planner's sca
### Test Types ### Test Types
1. **Unit Tests** (`test_replica_calculation.py`) - Test the mathematical formulas for calculating prefill and decode replicas in isolation 1. **Unit Tests** (`components/src/dynamo/planner/tests/unit/test_replica_calculation.py`) - Test the mathematical formulas for calculating prefill and decode replicas in isolation
2. **End-to-End Tests** (`scaling/run_scaling_test.sh`) - Test complete workflow including Kubernetes deployment, load generation, and pod scaling validation 2. **End-to-End Tests** (`scaling/run_scaling_test.sh`) - Test complete workflow including Kubernetes deployment, load generation, and pod scaling validation
3. **End-to-End Perf Tests** (see instructions below) - Compare performance (goodput and goodput/GPU) on deployments with and without sla planner 3. **End-to-End Perf Tests** (see instructions below) - Compare performance (goodput and goodput/GPU) on deployments with and without sla planner
...@@ -149,12 +135,7 @@ This directory contains comprehensive tests for validating the SLA planner's sca ...@@ -149,12 +135,7 @@ This directory contains comprehensive tests for validating the SLA planner's sca
Test the replica calculation logic without requiring Kubernetes: Test the replica calculation logic without requiring Kubernetes:
```bash ```bash
# Set PYTHONPATH to include planner components PYTHONPATH=components/src python -m pytest components/src/dynamo/planner/tests/unit/test_replica_calculation.py -v
PYTHONPATH=components/src python -m pytest tests/planner/test_replica_calculation.py -v
# Or from the tests/planner directory:
cd tests/planner
PYTHONPATH=../../components/src python -m pytest test_replica_calculation.py -v
``` ```
**Note**: The unit tests automatically mock external dependencies (prometheus_client, runtime modules) to ensure they can run in isolation without requiring the full Dynamo environment. **Note**: The unit tests automatically mock external dependencies (prometheus_client, runtime modules) to ensure they can run in isolation without requiring the full Dynamo environment.
...@@ -165,85 +146,12 @@ Test complete scaling behavior including Kubernetes deployment and load generati ...@@ -165,85 +146,12 @@ Test complete scaling behavior including Kubernetes deployment and load generati
**Prerequisites:** **Prerequisites:**
- **[kube-prometheus-stack](../../docs/kubernetes/observability/metrics.md) installed and running.** The SLA planner requires Prometheus to observe metrics and make scaling decisions. - **[kube-prometheus-stack](../../../../../../docs/kubernetes/observability/metrics.md) installed and running.** The SLA planner requires Prometheus to observe metrics and make scaling decisions.
- Ensure the Dynamo operator was installed with the Prometheus endpoint configured (see [SLA Planner Quickstart Guide](../../docs/components/planner/planner-guide.md#prerequisites) for details). - Ensure the Dynamo operator was installed with the Prometheus endpoint configured (see [SLA Planner Quickstart Guide](../../../../../../docs/components/planner/planner-guide.md#prerequisites) for details).
**Prepare the test deployment manifest:**
The test requires modifying `examples/backends/vllm/deploy/disagg_planner.yaml` with test-specific planner arguments:
1. Copy the base deployment:
```bash
cp examples/backends/vllm/deploy/disagg_planner.yaml tests/planner/scaling/disagg_planner.yaml
```
2. Edit `tests/planner/scaling/disagg_planner.yaml`. Ensure all services use the correct image. Modify the Planner service args:
```yaml
spec:
services:
Planner:
extraPodSpec:
mainContainer:
args:
- --environment=kubernetes
- --backend=vllm
- --adjustment-interval=60
- --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
- --ttft=100
- --itl=10
- --load-predictor=constant
- --no-correction
```
Remove `volumes` and `volumeMounts`:
```
# Remove these lines or any similar lines
volumeMounts:
- name: planner-profile-data
mountPath: /workspace/profiling_results
readOnly: true
volumes:
- name: planner-profile-data
configMap:
# Must be pre-created before deployment by the profiler
# See docs/components/planner/planner-guide.md for more details
name: planner-profile-data
```
3. Update the model in VllmPrefillWorker and VllmDecodeWorker services:
```yaml
args:
- -m
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --migration-limit=3
- --max-model-len=8192
```
**Run the test:**
```bash
./scaling/run_scaling_test.sh --namespace <namespace>
```
To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
```bash
./scaling/run_scaling_test.sh --namespace <namespace> --save-results
```
**E2E Test Deployment Management:**
- If no deployment exists: creates, tests, and cleans up deployment
- If deployment exists: uses existing deployment and preserves it
- Perfect for development workflows where you want to keep deployments running between tests
**Test Scenario** **Test Scenario**
The main test scenario validates prefill scaling for H200 with 1P1D → 2P1D configuration: The main test scenario validates prefill scaling for H200 with 1P1D -> 2P1D configuration:
- **Phase 1**: 8 req/s for 90s (baseline - maintains 1P1D) - **Phase 1**: 8 req/s for 90s (baseline - maintains 1P1D)
- **Phase 2**: 18 req/s for 120s (scaling trigger - scales to 2P1D) - **Phase 2**: 18 req/s for 120s (scaling trigger - scales to 2P1D)
...@@ -252,6 +160,18 @@ The main test scenario validates prefill scaling for H200 with 1P1D → 2P1D con ...@@ -252,6 +160,18 @@ The main test scenario validates prefill scaling for H200 with 1P1D → 2P1D con
- **Total test duration**: ~7 minutes + scaling observation - **Total test duration**: ~7 minutes + scaling observation
- **Smart cleanup**: Only removes deployment if test created it (preserves existing deployments) - **Smart cleanup**: Only removes deployment if test created it (preserves existing deployments)
Run the test with:
```bash
components/src/dynamo/planner/tests/manual/scaling/run_scaling_test.sh --namespace <namespace>
```
To save results to `components/src/dynamo/planner/tests/e2e_scaling_results` instead of `/tmp`:
```bash
components/src/dynamo/planner/tests/manual/scaling/run_scaling_test.sh --namespace <namespace> --save-results
```
### Instructions for End-to-End Perf Tests ### Instructions for End-to-End Perf Tests
In this test, we compare performance (goodput and goodput/GPU) across the following four deployments using the aforementioned 8b FP8 model on H200 and the dataset used in dryrun: In this test, we compare performance (goodput and goodput/GPU) across the following four deployments using the aforementioned 8b FP8 model on H200 and the dataset used in dryrun:
...@@ -282,9 +202,9 @@ aiperf profile \ ...@@ -282,9 +202,9 @@ aiperf profile \
--model nvidia/Llama-3.1-8B-Instruct-FP8 \ --model nvidia/Llama-3.1-8B-Instruct-FP8 \
--tokenizer nvidia/Llama-3.1-8B-Instruct-FP8 \ --tokenizer nvidia/Llama-3.1-8B-Instruct-FP8 \
--endpoint-type chat \ --endpoint-type chat \
--url localhost:8000 \ # or the port-forwarded port --url localhost:8000 \
--streaming \ --streaming \
--input-file /workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \ --input-file /workspace/rr-5-45_i3000o300.jsonl \
--custom-dataset-type mooncake_trace \ --custom-dataset-type mooncake_trace \
--goodput "time_to_first_token:200 inter_token_latency:10" --goodput "time_to_first_token:200 inter_token_latency:10"
``` ```
...@@ -302,5 +222,4 @@ The table below shows the performance improvement of SLA planner across differen ...@@ -302,5 +222,4 @@ The table below shows the performance improvement of SLA planner across differen
|---------------|-----------------|-------------------------| |---------------|-----------------|-------------------------|
| Inefficient P/D ratio | 725% | 600% | | Inefficient P/D ratio | 725% | 600% |
| Inefficient parallelization mapping | 311% | 249% | | Inefficient parallelization mapping | 311% | 249% |
| Best static deployment | 52% | 29% |` | Best static deployment | 52% | 29% |
...@@ -37,7 +37,7 @@ spec: ...@@ -37,7 +37,7 @@ spec:
memory: "100Gi" memory: "100Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: my-registry/vllm-runtime:my-tag image: my-registry/dynamo-frontend:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
command: command:
- /bin/sh - /bin/sh
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment