Unverified Commit 8c2a4681 authored by atchernych's avatar atchernych Committed by GitHub
Browse files

feat: Add GAIE nightly integration test [DEP-423] (#7458)


Signed-off-by: default avatarAnna Tchernych <atchernych@nvidia.com>
parent 16bca7b6
...@@ -31,17 +31,28 @@ inputs: ...@@ -31,17 +31,28 @@ inputs:
framework: framework:
description: 'Framework name (vllm, sglang, trtllm)' description: 'Framework name (vllm, sglang, trtllm)'
required: true required: false
default: ''
profile: profile:
description: 'Deployment profile (e.g., disagg_router, agg)' description: 'Deployment profile (e.g., disagg_router, agg)'
required: true required: false
default: ''
image: image:
description: 'Full container image reference for the framework runtime' description: 'Full container image reference for the framework runtime'
required: true required: false
default: ''
platform_arch: platform_arch:
description: 'Platform architecture (amd64, arm64)' description: 'Platform architecture (amd64, arm64)'
required: false required: false
default: 'amd64' default: 'amd64'
test_name:
description: 'Name for artifact naming. Defaults to {framework}_{profile}.'
required: false
default: ''
extra_pytest_args:
description: 'Additional pytest arguments (e.g., --frontend-image=...)'
required: false
default: ''
runs: runs:
using: "composite" using: "composite"
...@@ -82,16 +93,33 @@ runs: ...@@ -82,16 +93,33 @@ runs:
FRAMEWORK: ${{ inputs.framework }} FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }} PROFILE: ${{ inputs.profile }}
IMAGE: ${{ inputs.image }} IMAGE: ${{ inputs.image }}
TEST_NAME: ${{ inputs.test_name }}
EXTRA_PYTEST_ARGS: ${{ inputs.extra_pytest_args }}
run: | run: |
mkdir -p test-results mkdir -p test-results
PYTEST_ARGS=""
if [ -n "${FRAMEWORK}" ]; then
PYTEST_ARGS+=" --framework=${FRAMEWORK}"
fi
if [ -n "${PROFILE}" ]; then
PYTEST_ARGS+=" --profile=${PROFILE}"
fi
if [ -n "${IMAGE}" ]; then
PYTEST_ARGS+=" --image=${IMAGE}"
fi
if [ -z "${TEST_NAME}" ]; then
TEST_NAME="${FRAMEWORK:-unknown}_${PROFILE:-unknown}"
fi
pytest tests/deploy/test_deploy.py \ pytest tests/deploy/test_deploy.py \
--framework="${FRAMEWORK}" \
--profile="${PROFILE}" \
--image="${IMAGE}" \
--namespace="${NAMESPACE}" \ --namespace="${NAMESPACE}" \
${PYTEST_ARGS} \
${EXTRA_PYTEST_ARGS} \
-v -s \ -v -s \
--durations=10 \ --durations=10 \
--junitxml=test-results/pytest_deploy_${FRAMEWORK}_${PROFILE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml \ --junitxml=test-results/pytest_deploy_${TEST_NAME}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml \
--log-cli-level=INFO --log-cli-level=INFO
- name: Cleanup Deployment - name: Cleanup Deployment
...@@ -124,10 +152,25 @@ runs: ...@@ -124,10 +152,25 @@ runs:
fi fi
echo "::endgroup::" echo "::endgroup::"
- name: Determine test name for artifacts
id: test-name
if: always()
shell: bash
env:
TEST_NAME: ${{ inputs.test_name }}
FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }}
run: |
if [ -z "${TEST_NAME}" ]; then
TEST_NAME="${FRAMEWORK:-unknown}_${PROFILE:-unknown}"
fi
echo "name=${TEST_NAME}" >> $GITHUB_OUTPUT
- name: Upload Test Results - name: Upload Test Results
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6 uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6
if: always() if: always()
with: with:
name: test-results-${{ inputs.framework }}-${{ inputs.profile }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }} name: test-results-${{ steps.test-name.outputs.name }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_deploy_${{ inputs.framework }}_${{ inputs.profile }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml path: test-results/pytest_deploy_${{ steps.test-name.outputs.name }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days: 7 retention-days: 7
...@@ -190,6 +190,16 @@ jobs: ...@@ -190,6 +190,16 @@ jobs:
copy_to_acr: false copy_to_acr: false
secrets: inherit secrets: inherit
# ============================================================================
# FRONTEND IMAGE BUILD
# ============================================================================
frontend-image:
name: Frontend Image
uses: ./.github/workflows/build-frontend-image.yaml
with:
skip_change_detection: true
secrets: inherit
# ============================================================================ # ============================================================================
# Operator # Operator
# ============================================================================ # ============================================================================
...@@ -354,7 +364,7 @@ jobs: ...@@ -354,7 +364,7 @@ jobs:
deploy-cleanup: deploy-cleanup:
if: always() if: always()
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm] needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie]
runs-on: prod-default-small-v2 runs-on: prod-default-small-v2
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
...@@ -366,9 +376,37 @@ jobs: ...@@ -366,9 +376,37 @@ jobs:
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }} vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }} vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
# ============================================================================
# GAIE DEPLOY TEST
# ============================================================================
deploy-test-gaie:
name: GAIE Deploy Test
runs-on: prod-default-small-v2
needs: [deploy-operator, frontend-image, vllm-pipeline]
timeout-minutes: 30
permissions:
contents: read
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Run GAIE Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.namespace }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
hf_token: ${{ secrets.HF_TOKEN }}
test_name: gaie
extra_pytest_args: >-
-m framework_with_gaie
--frontend-image=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-frontend
--image=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-runtime-cuda12-amd64
deploy-status-check: deploy-status-check:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm] needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie]
if: always() if: always()
steps: steps:
- name: "Check all deploy test jobs" - name: "Check all deploy test jobs"
...@@ -383,7 +421,7 @@ jobs: ...@@ -383,7 +421,7 @@ jobs:
name: Clean K8s builder if exists name: Clean K8s builder if exists
runs-on: prod-default-small-v2 runs-on: prod-default-small-v2
if: always() if: always()
needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator] needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image]
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
...@@ -404,7 +442,7 @@ jobs: ...@@ -404,7 +442,7 @@ jobs:
name: Notify Slack name: Notify Slack
runs-on: prod-builder-amd-v1 runs-on: prod-builder-amd-v1
if: always() && failure() if: always() && failure()
needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm ] needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie ]
permissions: permissions:
contents: read contents: read
steps: steps:
......
...@@ -85,3 +85,4 @@ jobs: ...@@ -85,3 +85,4 @@ jobs:
profile: ${{ matrix.profile }} profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${{ inputs.image_suffix }} image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${{ inputs.image_suffix }}
platform_arch: amd64 platform_arch: amd64
extra_pytest_args: -m framework_only
...@@ -122,7 +122,11 @@ For the HttpRoute service make sure to specify the namespace where your gateway ...@@ -122,7 +122,11 @@ For the HttpRoute service make sure to specify the namespace where your gateway
```bash ```bash
cd <dynamo-source-root> cd <dynamo-source-root>
# kubectl get httproutes -n my-model # Make sure you do not have an incompatible HttpRoute running, delete if so. # kubectl get httproutes -n my-model # Make sure you do not have an incompatible HttpRoute running, delete if so.
# Choose either the disaggregated (disagg) or the aggregated (agg) example
kubectl apply -f examples/backends/vllm/deploy/gaie/disagg.yaml -n my-model
# or
kubectl apply -f examples/backends/vllm/deploy/gaie/agg.yaml -n my-model kubectl apply -f examples/backends/vllm/deploy/gaie/agg.yaml -n my-model
# Then apply the HTTPRoute so the gateway routes traffic to the deployment
kubectl apply -f examples/backends/vllm/deploy/gaie/http-route.yaml -n my-model kubectl apply -f examples/backends/vllm/deploy/gaie/http-route.yaml -n my-model
``` ```
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: qwen-agg name: qwen
spec: spec:
backendFramework: vllm backendFramework: vllm
services: services:
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: qwen
spec:
backendFramework: vllm
services:
# Epp service: a single-replica component of type `epp`. It reads its
# credentials from the `hf-token-secret` secret and carries the scheduling
# configuration (plugins + scheduling profiles) for prefill and decode.
Epp:
envFromSecret: hf-token-secret
componentType: epp
replicas: 1
extraPodSpec:
mainContainer:
# NOTE(review): `my-tag` looks like a placeholder tag; replace it with a real
# EPP image tag before applying this manifest by hand.
image: nvcr.io/nvidia/ai-dynamo/epp-image:my-tag
imagePullPolicy: IfNotPresent
env:
# Same value as the `--block-size 16` argument passed to both vLLM workers
# in this file; presumably the two must be kept in sync — verify.
- name: DYN_KV_CACHE_BLOCK_SIZE
value: "16"
# Same model as SERVED_MODEL_NAME / MODEL_PATH on the workers in this file.
- name: DYN_MODEL_NAME
value: "Qwen/Qwen3-0.6B"
- name: DYN_ENFORCE_DISAGG
value: "true"
eppConfig:
config:
# Plugin instances; the named ones are referenced by `pluginRef` in
# `schedulingProfiles` below.
plugins:
- type: disagg-profile-handler
# Label filter accepting only the value "prefill" for the
# nvidia.com/dynamo-sub-component-type label.
- name: prefill-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "prefill"
allowsNoLabel: false
# Label filter accepting only the value "decode" for the same label.
- name: decode-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "decode"
allowsNoLabel: false
- name: picker
type: max-score-picker
- name: dyn-prefill
type: dyn-prefill-scorer
- name: dyn-decode
type: dyn-decode-scorer
# Two profiles, one per phase. Each references its label filter, its scorer
# and the shared picker, all with weight 1.
schedulingProfiles:
- name: prefill
plugins:
- pluginRef: prefill-filter
weight: 1
- pluginRef: dyn-prefill
weight: 1
- pluginRef: picker
weight: 1
- name: decode
plugins:
- pluginRef: decode-filter
weight: 1
- pluginRef: dyn-decode
weight: 1
- pluginRef: picker
weight: 1
# Prefill worker: a `worker` component with subComponentType `prefill`,
# one replica, one GPU, 2Gi of shared memory.
VllmPrefillWorker:
componentType: worker
subComponentType: prefill
envFromSecret: hf-token-secret
sharedMemory:
size: 2Gi
# Frontend sidecar started with `-m dynamo.frontend --router-mode direct`.
frontendSidecar:
# NOTE(review): this image lives in a personal-looking Docker Hub repository
# (`lambda108`); confirm it is the intended default for a published example.
image: docker.io/lambda108/dynamo:post-rebase
args:
- -m
- dynamo.frontend
- --router-mode
- direct
envFromSecret: hf-token-secret
extraPodSpec:
# Soft (preferred, weight 100) pod affinity towards pods labelled
# nvidia.com/dynamo-component-type=worker, using the node hostname as the
# topology key.
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
# These variables are expanded by /bin/sh inside the command string below.
env:
- name: SERVED_MODEL_NAME
value: "Qwen/Qwen3-0.6B"
- name: MODEL_PATH
value: "Qwen/Qwen3-0.6B"
# Single shell command string run via `/bin/sh -c`; this worker passes
# `--disaggregation-mode prefill`.
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 1 --data-parallel-size 1 --disaggregation-mode prefill --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --enable-prefix-caching --block-size 16 --kv-events-config '{\"enable_kv_cache_events\":true}'"
command:
- /bin/sh
- -c
# NOTE(review): same personal-looking repository as the sidecar image above.
image: docker.io/lambda108/dynamo:post-rebase
imagePullPolicy: IfNotPresent
workingDir: /workspace/examples/backends/vllm
replicas: 1
resources:
limits:
gpu: "1"
requests:
gpu: "1"
# Decode worker: a `worker` component with subComponentType `decode`,
# one replica, one GPU, 2Gi of shared memory.
VllmDecodeWorker:
componentType: worker
subComponentType: decode
envFromSecret: hf-token-secret
sharedMemory:
size: 2Gi
# Frontend sidecar started with `-m dynamo.frontend --router-mode direct`.
frontendSidecar:
# NOTE(review): this image lives in a personal-looking Docker Hub repository
# (`lambda108`); confirm it is the intended default for a published example.
image: docker.io/lambda108/dynamo:post-rebase
args:
- -m
- dynamo.frontend
- --router-mode
- direct
envFromSecret: hf-token-secret
extraPodSpec:
# Soft (preferred, weight 100) pod affinity towards pods labelled
# nvidia.com/dynamo-component-type=worker, using the node hostname as the
# topology key.
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
# These variables are expanded by /bin/sh inside the command string below.
env:
- name: SERVED_MODEL_NAME
value: "Qwen/Qwen3-0.6B"
- name: MODEL_PATH
value: "Qwen/Qwen3-0.6B"
# Single shell command string run via `/bin/sh -c`. Unlike the prefill
# worker, no `--disaggregation-mode` flag is passed here.
# NOTE(review): presumably decode is the default mode — confirm.
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 1 --data-parallel-size 1 --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --enable-prefix-caching --block-size 16 --kv-events-config '{\"enable_kv_cache_events\":true}'"
command:
- /bin/sh
- -c
# NOTE(review): same personal-looking repository as the sidecar image above.
image: docker.io/lambda108/dynamo:post-rebase
imagePullPolicy: IfNotPresent
workingDir: /workspace/examples/backends/vllm
replicas: 1
resources:
limits:
gpu: "1"
requests:
gpu: "1"
\ No newline at end of file
...@@ -18,18 +18,18 @@ ...@@ -18,18 +18,18 @@
apiVersion: gateway.networking.k8s.io/v1 apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute kind: HTTPRoute
metadata: metadata:
name: qwen-agg-route name: qwen-route
spec: spec:
parentRefs: parentRefs:
- group: gateway.networking.k8s.io - group: gateway.networking.k8s.io
kind: Gateway kind: Gateway
name: inference-gateway name: inference-gateway
namespace: my-model # the namespace where your gateway is deployed. namespace: default # the namespace where your gateway is deployed.
rules: rules:
- backendRefs: - backendRefs:
- group: inference.networking.k8s.io - group: inference.networking.k8s.io
kind: InferencePool kind: InferencePool
name: qwen-agg-pool name: qwen-pool
port: 8000 port: 8000
weight: 1 weight: 1
matches: matches:
......
...@@ -254,6 +254,8 @@ markers = [ ...@@ -254,6 +254,8 @@ markers = [
"k8s: marks tests as requiring Kubernetes", "k8s: marks tests as requiring Kubernetes",
"fault_tolerance: marks tests as fault tolerance tests", "fault_tolerance: marks tests as fault tolerance tests",
"deploy: marks tests as deployment tests", "deploy: marks tests as deployment tests",
"framework_only: marks standard framework deployment tests (vllm, sglang, trtllm)",
"framework_with_gaie: marks tests for GAIE (Gateway API Inference Extension) deployment",
# Built-in markers # Built-in markers
"skip: skip this test", "skip: skip this test",
"skipif: skip if condition is true", "skipif: skip if condition is true",
......
...@@ -22,6 +22,8 @@ spec: ...@@ -22,6 +22,8 @@ spec:
value: "128" value: "128"
- name: DYN_MODEL_NAME - name: DYN_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: DYN_ENFORCE_DISAGG
value: "true"
eppConfig: eppConfig:
config: config:
plugins: plugins:
...@@ -81,6 +83,10 @@ spec: ...@@ -81,6 +83,10 @@ spec:
- direct - direct
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
extraPodSpec: extraPodSpec:
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
affinity: affinity:
podAffinity: podAffinity:
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
...@@ -132,6 +138,10 @@ spec: ...@@ -132,6 +138,10 @@ spec:
- direct - direct
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
extraPodSpec: extraPodSpec:
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
affinity: affinity:
podAffinity: podAffinity:
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
......
...@@ -40,6 +40,12 @@ def pytest_addoption(parser: pytest.Parser) -> None: ...@@ -40,6 +40,12 @@ def pytest_addoption(parser: pytest.Parser) -> None:
help="Deployment profile to test (e.g., agg, disagg, disagg_router). " help="Deployment profile to test (e.g., agg, disagg, disagg_router). "
"If not specified, runs all profiles for the selected framework.", "If not specified, runs all profiles for the selected framework.",
) )
parser.addoption(
"--frontend-image",
type=str,
default=None,
help="Frontend container image (used by GAIE deploy tests).",
)
@dataclass(frozen=True) @dataclass(frozen=True)
......
...@@ -9,14 +9,23 @@ to chat completion requests correctly. ...@@ -9,14 +9,23 @@ to chat completion requests correctly.
""" """
import logging import logging
import os
import subprocess
import time
from typing import Any, Dict from typing import Any, Dict
import kr8s
import pytest import pytest
import requests import requests
import yaml
from tests.deploy.conftest import DeploymentTarget from tests.deploy.conftest import DeploymentTarget
from tests.utils.client import send_request, wait_for_model_availability from tests.utils.client import send_request, wait_for_model_availability
from tests.utils.managed_deployment import DeploymentSpec, ManagedDeployment from tests.utils.managed_deployment import (
DeploymentSpec,
ManagedDeployment,
_get_workspace_dir,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -38,6 +47,7 @@ DEFAULT_REQUEST_TIMEOUT = 120 ...@@ -38,6 +47,7 @@ DEFAULT_REQUEST_TIMEOUT = 120
# Minimum response content length to validate that the model is generating meaningful output. # Minimum response content length to validate that the model is generating meaningful output.
# This matches the validation threshold from the original shell-based deployment tests. # This matches the validation threshold from the original shell-based deployment tests.
MIN_RESPONSE_CONTENT_LENGTH = 100 MIN_RESPONSE_CONTENT_LENGTH = 100
GAIE_MODEL_NAME = "Qwen/Qwen3-0.6B"
def validate_chat_response( def validate_chat_response(
...@@ -100,6 +110,7 @@ def validate_chat_response( ...@@ -100,6 +110,7 @@ def validate_chat_response(
return data return data
@pytest.mark.framework_only
@pytest.mark.k8s @pytest.mark.k8s
@pytest.mark.deploy @pytest.mark.deploy
@pytest.mark.post_merge @pytest.mark.post_merge
...@@ -213,3 +224,203 @@ async def test_deployment( ...@@ -213,3 +224,203 @@ async def test_deployment(
f"Deployment test PASSED for {deployment_target.test_id} " f"Deployment test PASSED for {deployment_target.test_id} "
f"(source: {deployment_target.source}, model: {model}, namespace: {namespace})" f"(source: {deployment_target.source}, model: {model}, namespace: {namespace})"
) )
def _gaie_kubectl(*args, stdin=None):
    """Run ``kubectl`` with *args* and capture stdout/stderr as text.

    ``stdin``, when given, is fed to the process on standard input. A
    non-zero exit status does not raise; callers inspect ``returncode``.
    """
    return subprocess.run(
        ["kubectl", *args],
        input=stdin,
        capture_output=True,
        text=True,
    )


# GAIE (Gateway API Inference Extension) deployment test
@pytest.mark.framework_with_gaie
@pytest.mark.k8s
@pytest.mark.deploy
@pytest.mark.post_merge
@pytest.mark.e2e
@pytest.mark.timeout(900)
async def test_gaie_deployment(
    image: str,
    namespace: str,
    skip_service_restart: bool,
    request,
) -> None:
    """Deploy the vLLM GAIE disaggregated example and query it via the Gateway.

    The example DynamoGraphDeployment is patched so that the ``Epp`` service
    and both workers' frontend sidecars run the ``--frontend-image`` image
    while the workers themselves run ``--image``. The example HTTPRoute is
    applied with a hostname derived from the namespace, and a chat-completion
    request is then sent through a port-forward to the ``inference-gateway``
    service using that hostname as the ``Host`` header.
    """
    frontend_image = request.config.getoption("--frontend-image")
    worker_image = image

    # All three inputs are mandatory for this test.
    assert frontend_image, "--frontend-image is required for GAIE deploy test"
    assert worker_image, "--image is required for GAIE deploy test"
    assert namespace, "--namespace is required for GAIE deploy test"

    # Locate the example manifests inside the workspace checkout.
    manifests_dir = os.path.join(
        _get_workspace_dir(), "examples", "backends", "vllm", "deploy", "gaie"
    )
    dgd_manifest = os.path.join(manifests_dir, "disagg.yaml")
    route_manifest = os.path.join(manifests_dir, "http-route.yaml")
    assert os.path.exists(dgd_manifest), f"disagg.yaml not found: {dgd_manifest}"
    assert os.path.exists(
        route_manifest
    ), f"http-route.yaml not found: {route_manifest}"

    deployment_spec = DeploymentSpec(dgd_manifest)
    deployment_spec.namespace = namespace

    # Swap the images baked into the example for the ones under test.
    logger.info(f"Frontend image: {frontend_image}")
    logger.info(f"Worker image: {worker_image}")
    deployment_spec.set_image(frontend_image, service_name="Epp")
    for worker_service in ("VllmPrefillWorker", "VllmDecodeWorker"):
        deployment_spec.set_image(worker_image, service_name=worker_service)
        deployment_spec.set_frontend_sidecar_image(
            frontend_image, service_name=worker_service
        )

    # Give the route a namespace-specific hostname; requests below select it
    # through the Host header.
    route_hostname = f"{namespace}.example.com"
    logger.info(f"HTTPRoute hostname: {route_hostname}")
    with open(route_manifest) as manifest_file:
        route_doc = yaml.safe_load(manifest_file)
    route_doc["spec"]["hostnames"] = [route_hostname]
    route_yaml = yaml.safe_dump(route_doc)

    logger.info("Applying GAIE HTTPRoute...")
    applied = _gaie_kubectl("apply", "-n", namespace, "-f", "-", stdin=route_yaml)
    logger.info(f"HTTPRoute apply stdout: {applied.stdout}")
    if applied.stderr:
        logger.warning(f"HTTPRoute apply stderr: {applied.stderr}")
    assert applied.returncode == 0, f"Failed to apply HTTPRoute: {applied.stderr}"

    # Debug: verify namespace state and operator CRD before creating the DGD.
    logger.info(f"Namespace: {namespace}")
    preflight_probes = (
        ("Namespace check", "Namespace not found", ("namespace", namespace)),
        ("CRD check", "CRD not found", ("crd", "dynamographdeployments.nvidia.com")),
    )
    for ok_label, missing_label, get_args in preflight_probes:
        probe = _gaie_kubectl("get", *get_args)
        logger.info(f"{ok_label}: {probe.stdout.strip()}")
        if probe.returncode != 0:
            logger.error(f"{missing_label}: {probe.stderr}")

    # Debug: check operator pod status.
    operator_pods = _gaie_kubectl(
        "get",
        "pods",
        "-n",
        namespace,
        "-l",
        "app.kubernetes.io/name=dynamo-operator",
    )
    logger.info(f"Operator pods: {operator_pods.stdout.strip()}")

    # Debug: log the identity of the deployment spec being submitted.
    logger.info(f"DGD name: {deployment_spec.name}")
    logger.info(f"DGD namespace: {deployment_spec.namespace}")
    logger.info(f"DGD services: {[s.name for s in deployment_spec.services]}")

    async with ManagedDeployment(
        log_dir=request.node.name,
        deployment_spec=deployment_spec,
        namespace=namespace,
        skip_service_restart=skip_service_restart,
        frontend_service_name="Epp",
    ) as deployment:
        # Debug: record which DGDs and pods exist after creation.
        dgd_listing = _gaie_kubectl("get", "dynamographdeployments", "-n", namespace)
        logger.info(f"DGDs after creation: {dgd_listing.stdout.strip()}")
        pod_listing = _gaie_kubectl("get", "pods", "-n", namespace, "-o", "wide")
        logger.info(f"Pods after creation: {pod_listing.stdout.strip()}")

        epp_pod_list = deployment.get_pods(["Epp"]).get("Epp", [])
        assert len(epp_pod_list) > 0, "No EPP pods found for GAIE deployment"
        logger.info(f"Found EPP pod: {epp_pod_list[0].name}")

        gateway_svcs = list(
            kr8s.get("services", "inference-gateway", namespace=namespace)
        )
        assert (
            len(gateway_svcs) > 0
        ), f"inference-gateway service not found in namespace {namespace}"

        # local_port=0 is passed through unchanged; the chosen port is read
        # back from gateway_pf.local_port below.
        gateway_pf = gateway_svcs[0].portforward(remote_port=80, local_port=0)
        gateway_pf.start()
        time.sleep(2)

        try:
            gateway_url = f"http://localhost:{gateway_pf.local_port}"
            logger.info(f"Gateway port-forward established: {gateway_url}")

            endpoint = deployment_spec.endpoint
            headers = {"Host": route_hostname}
            logger.info(f"Using Host header: {route_hostname}")

            model_ready = wait_for_model_availability(
                url=gateway_url,
                endpoint=endpoint,
                model=GAIE_MODEL_NAME,
                logger=logger,
                max_attempts=30,
                headers=headers,
            )
            assert model_ready, (
                f"Model '{GAIE_MODEL_NAME}' did not become available "
                f"within the timeout period"
            )

            url = f"{gateway_url}{endpoint}"
            payload = {
                "model": GAIE_MODEL_NAME,
                "messages": [{"role": "user", "content": TEST_PROMPT}],
                "max_tokens": DEFAULT_MAX_TOKENS,
                "temperature": DEFAULT_TEMPERATURE,
                "stream": False,
            }
            logger.info(f"Sending inference request to {url}")
            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=DEFAULT_REQUEST_TIMEOUT,
            )

            validate_chat_response(
                response=response,
                expected_model=GAIE_MODEL_NAME,
                min_content_length=MIN_RESPONSE_CONTENT_LENGTH,
            )

            data = response.json()
            content = data["choices"][0]["message"]["content"]
            logger.info(
                f"GAIE deployment test PASSED | "
                f"model={data['model']}, status={response.status_code}, "
                f"response_length={len(content)} chars\n"
                f"Model response: {content}"
            )
        finally:
            # Always tear the port-forward down, even when an assertion fails.
            gateway_pf.stop()
...@@ -159,6 +159,7 @@ def wait_for_model_availability( ...@@ -159,6 +159,7 @@ def wait_for_model_availability(
logger: logging.Logger, logger: logging.Logger,
max_attempts: int = 15, max_attempts: int = 15,
attempt_timeouts: list[float] | None = None, attempt_timeouts: list[float] | None = None,
headers: dict[str, str] | None = None,
) -> bool: ) -> bool:
""" """
Wait for model to be available by sending test requests. Wait for model to be available by sending test requests.
...@@ -197,7 +198,9 @@ def wait_for_model_availability( ...@@ -197,7 +198,9 @@ def wait_for_model_availability(
logger.debug( logger.debug(
f"Testing model availability at {test_url} (attempt {attempt+1}/{max_attempts}, timeout={timeout_val}s)" f"Testing model availability at {test_url} (attempt {attempt+1}/{max_attempts}, timeout={timeout_val}s)"
) )
response = requests.post(test_url, json=test_payload, timeout=timeout_val) response = requests.post(
test_url, json=test_payload, timeout=timeout_val, headers=headers
)
if response.status_code == 200: if response.status_code == 200:
logger.info(f"Model '{model}' is available and responding") logger.info(f"Model '{model}' is available and responding")
......
...@@ -66,6 +66,20 @@ class ServiceSpec: ...@@ -66,6 +66,20 @@ class ServiceSpec:
self._spec["extraPodSpec"]["mainContainer"] = {} self._spec["extraPodSpec"]["mainContainer"] = {}
self._spec["extraPodSpec"]["mainContainer"]["image"] = value self._spec["extraPodSpec"]["mainContainer"]["image"] = value
@property
def frontend_sidecar_image(self) -> Optional[str]:
    """Container image for the ``frontendSidecar`` (if present).

    Returns:
        The ``image`` value stored under ``frontendSidecar`` in the service
        spec, or ``None`` when the section is missing, is null (an empty
        ``frontendSidecar:`` key in YAML), or has no ``image`` entry.
    """
    sidecar = self._spec.get("frontendSidecar")
    if sidecar is None:
        # Missing key and explicit null are treated alike; indexing None
        # would raise TypeError rather than the KeyError callers expect to
        # be absorbed.
        return None
    return sidecar.get("image")


@frontend_sidecar_image.setter
def frontend_sidecar_image(self, value: str):
    """Set the ``frontendSidecar`` image, creating the section if needed.

    Any other keys already present under ``frontendSidecar`` are kept.
    """
    if self._spec.get("frontendSidecar") is None:
        # Covers both an absent key and a null placeholder.
        self._spec["frontendSidecar"] = {}
    self._spec["frontendSidecar"]["image"] = value
@property @property
def envs(self) -> list[dict[str, str]]: def envs(self) -> list[dict[str, str]]:
"""Environment variables for the service""" """Environment variables for the service"""
...@@ -230,6 +244,16 @@ class DeploymentSpec: ...@@ -230,6 +244,16 @@ class DeploymentSpec:
for service in services: for service in services:
service.image = image service.image = image
def set_frontend_sidecar_image(
    self, image: str, service_name: Optional[str] = None
):
    """Assign *image* as the frontend sidecar image of one or all services.

    Args:
        image: Container image reference to assign.
        service_name: Name of the single service to update. When ``None``,
            every service in ``self.services`` is updated.
    """
    targets = self.services if service_name is None else [self[service_name]]
    for target in targets:
        target.frontend_sidecar_image = image
def set_tensor_parallel(self, tp_size: int, service_names: Optional[list] = None): def set_tensor_parallel(self, tp_size: int, service_names: Optional[list] = None):
"""Scale deployment for different tensor parallel configurations """Scale deployment for different tensor parallel configurations
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment