"vscode:/vscode.git/clone" did not exist on "3dd5266eee396832a460ed4a971dc2d5ead9b4b2"
Unverified commit 8c2a4681, authored by atchernych and committed by GitHub
Browse files

feat: Add GAIE nightly integration test [DEP-423] (#7458)


Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
parent 16bca7b6
......@@ -31,17 +31,28 @@ inputs:
framework:
description: 'Framework name (vllm, sglang, trtllm)'
required: true
required: false
default: ''
profile:
description: 'Deployment profile (e.g., disagg_router, agg)'
required: true
required: false
default: ''
image:
description: 'Full container image reference for the framework runtime'
required: true
required: false
default: ''
platform_arch:
description: 'Platform architecture (amd64, arm64)'
required: false
default: 'amd64'
test_name:
description: 'Name for artifact naming. Defaults to {framework}_{profile}.'
required: false
default: ''
extra_pytest_args:
description: 'Additional pytest arguments (e.g., --frontend-image=...)'
required: false
default: ''
runs:
using: "composite"
......@@ -82,16 +93,33 @@ runs:
FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }}
IMAGE: ${{ inputs.image }}
TEST_NAME: ${{ inputs.test_name }}
EXTRA_PYTEST_ARGS: ${{ inputs.extra_pytest_args }}
run: |
mkdir -p test-results
PYTEST_ARGS=""
if [ -n "${FRAMEWORK}" ]; then
PYTEST_ARGS+=" --framework=${FRAMEWORK}"
fi
if [ -n "${PROFILE}" ]; then
PYTEST_ARGS+=" --profile=${PROFILE}"
fi
if [ -n "${IMAGE}" ]; then
PYTEST_ARGS+=" --image=${IMAGE}"
fi
if [ -z "${TEST_NAME}" ]; then
TEST_NAME="${FRAMEWORK:-unknown}_${PROFILE:-unknown}"
fi
pytest tests/deploy/test_deploy.py \
--framework="${FRAMEWORK}" \
--profile="${PROFILE}" \
--image="${IMAGE}" \
--namespace="${NAMESPACE}" \
${PYTEST_ARGS} \
${EXTRA_PYTEST_ARGS} \
-v -s \
--durations=10 \
--junitxml=test-results/pytest_deploy_${FRAMEWORK}_${PROFILE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml \
--junitxml=test-results/pytest_deploy_${TEST_NAME}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml \
--log-cli-level=INFO
- name: Cleanup Deployment
......@@ -124,10 +152,25 @@ runs:
fi
echo "::endgroup::"
- name: Determine test name for artifacts
id: test-name
if: always()
shell: bash
env:
TEST_NAME: ${{ inputs.test_name }}
FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }}
run: |
if [ -z "${TEST_NAME}" ]; then
TEST_NAME="${FRAMEWORK:-unknown}_${PROFILE:-unknown}"
fi
echo "name=${TEST_NAME}" >> $GITHUB_OUTPUT
- name: Upload Test Results
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6
if: always()
with:
name: test-results-${{ inputs.framework }}-${{ inputs.profile }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_deploy_${{ inputs.framework }}_${{ inputs.profile }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
name: test-results-${{ steps.test-name.outputs.name }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_deploy_${{ steps.test-name.outputs.name }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days: 7
......@@ -190,6 +190,16 @@ jobs:
copy_to_acr: false
secrets: inherit
# ============================================================================
# FRONTEND IMAGE BUILD
# ============================================================================
frontend-image:
name: Frontend Image
uses: ./.github/workflows/build-frontend-image.yaml
with:
skip_change_detection: true
secrets: inherit
# ============================================================================
# Operator
# ============================================================================
......@@ -354,7 +364,7 @@ jobs:
deploy-cleanup:
if: always()
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie]
runs-on: prod-default-small-v2
steps:
- uses: actions/checkout@v4
......@@ -366,9 +376,37 @@ jobs:
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
# ============================================================================
# GAIE DEPLOY TEST
# ============================================================================
# Runs the GAIE deployment test through the shared dynamo-deploy-test composite
# action. The action's `framework`, `profile` and `image` inputs are left at
# their defaults; test selection and image references are passed through
# `extra_pytest_args` instead.
deploy-test-gaie:
name: GAIE Deploy Test
runs-on: prod-default-small-v2
# deploy-operator supplies the `namespace` and `operator_tag` outputs read below.
# NOTE(review): frontend-image and vllm-pipeline presumably publish the
# `-frontend` and `-vllm-runtime-cuda12-amd64` tags referenced in
# `extra_pytest_args` — confirm against those workflows.
needs: [deploy-operator, frontend-image, vllm-pipeline]
timeout-minutes: 30
permissions:
contents: read
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Run GAIE Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.namespace }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
hf_token: ${{ secrets.HF_TOKEN }}
# Explicit name for the junit file / uploaded artifact, since framework and
# profile are not set for this job.
test_name: gaie
# `-m framework_with_gaie` selects the tests carrying that pytest marker.
# `--frontend-image` and `--image` are both required by test_gaie_deployment.
# The `>-` folded scalar joins the three lines below into one space-separated
# string, so no comment may be placed between them.
extra_pytest_args: >-
-m framework_with_gaie
--frontend-image=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-frontend
--image=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-runtime-cuda12-amd64
deploy-status-check:
runs-on: ubuntu-latest
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie]
if: always()
steps:
- name: "Check all deploy test jobs"
......@@ -383,7 +421,7 @@ jobs:
name: Clean K8s builder if exists
runs-on: prod-default-small-v2
if: always()
needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator]
needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image]
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
......@@ -404,7 +442,7 @@ jobs:
name: Notify Slack
runs-on: prod-builder-amd-v1
if: always() && failure()
needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm ]
needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image, deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie ]
permissions:
contents: read
steps:
......
......@@ -85,3 +85,4 @@ jobs:
profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${{ inputs.image_suffix }}
platform_arch: amd64
extra_pytest_args: -m framework_only
......@@ -122,7 +122,11 @@ For the HttpRoute service make sure to specify the namespace where your gateway
```bash
cd <dynamo-source-root>
# kubectl get httproutes -n my-model # Make sure you do not have an incompatible HttpRoute running, delete if so.
# Choose disagg or agg example
kubectl apply -f examples/backends/vllm/deploy/gaie/disagg.yaml -n my-model
# or
kubectl apply -f examples/backends/vllm/deploy/gaie/agg.yaml -n my-model
# make sure to apply the route
kubectl apply -f examples/backends/vllm/deploy/gaie/http-route.yaml -n my-model
```
......
......@@ -3,7 +3,7 @@
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: qwen-agg
name: qwen
spec:
backendFramework: vllm
services:
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: qwen
spec:
backendFramework: vllm
services:
# Epp service (componentType: epp). The GAIE deploy test (test_gaie_deployment)
# uses this service as its `frontend_service_name` and replaces its image with
# the value of `--frontend-image`.
Epp:
envFromSecret: hf-token-secret
componentType: epp
replicas: 1
extraPodSpec:
mainContainer:
# NOTE(review): `my-tag` looks like a placeholder to be replaced before
# applying this example by hand — confirm.
image: nvcr.io/nvidia/ai-dynamo/epp-image:my-tag
imagePullPolicy: IfNotPresent
env:
# Same value as the `--block-size 16` flag passed to both vLLM workers in this file.
- name: DYN_KV_CACHE_BLOCK_SIZE
value: "16"
# Same model as SERVED_MODEL_NAME / MODEL_PATH on both vLLM workers in this file.
- name: DYN_MODEL_NAME
value: "Qwen/Qwen3-0.6B"
- name: DYN_ENFORCE_DISAGG
value: "true"
eppConfig:
config:
# Plugin instances. The named ones are referenced by `pluginRef` in
# `schedulingProfiles` below.
plugins:
- type: disagg-profile-handler
# Label filter that only accepts the value "prefill"; this matches
# `subComponentType: prefill` on VllmPrefillWorker.
# NOTE(review): presumably the operator turns subComponentType into this
# pod label — confirm.
- name: prefill-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "prefill"
allowsNoLabel: false
# Same filter for the value "decode" (VllmDecodeWorker).
- name: decode-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "decode"
allowsNoLabel: false
- name: picker
type: max-score-picker
- name: dyn-prefill
type: dyn-prefill-scorer
- name: dyn-decode
type: dyn-decode-scorer
# Two profiles, each chaining its own filter and scorer with the shared picker.
schedulingProfiles:
- name: prefill
plugins:
- pluginRef: prefill-filter
weight: 1
- pluginRef: dyn-prefill
weight: 1
- pluginRef: picker
weight: 1
- name: decode
plugins:
- pluginRef: decode-filter
weight: 1
- pluginRef: dyn-decode
weight: 1
- pluginRef: picker
weight: 1
# Prefill worker: started with `--disaggregation-mode prefill` and tagged
# `subComponentType: prefill`, the value accepted by the Epp `prefill-filter`.
VllmPrefillWorker:
componentType: worker
subComponentType: prefill
envFromSecret: hf-token-secret
sharedMemory:
size: 2Gi
# Sidecar started with `-m dynamo.frontend --router-mode direct`.
frontendSidecar:
# NOTE(review): this image lives under the docker.io/lambda108 namespace,
# unlike the nvcr.io/nvidia/ai-dynamo image used by the Epp service — confirm
# it is intended for a published example. test_gaie_deployment overrides it
# with its `--frontend-image` value.
image: docker.io/lambda108/dynamo:post-rebase
args:
- -m
- dynamo.frontend
- --router-mode
- direct
envFromSecret: hf-token-secret
extraPodSpec:
# Soft preference (weight 100, not a requirement) for a node that already
# hosts a pod labelled nvidia.com/dynamo-component-type=worker.
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "Qwen/Qwen3-0.6B"
- name: MODEL_PATH
value: "Qwen/Qwen3-0.6B"
# A single string run by `/bin/sh -c` (see `command`), so the shell expands
# $MODEL_PATH and $SERVED_MODEL_NAME from the env entries above.
# `--block-size 16` equals DYN_KV_CACHE_BLOCK_SIZE on the Epp service.
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 1 --data-parallel-size 1 --disaggregation-mode prefill --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --enable-prefix-caching --block-size 16 --kv-events-config '{\"enable_kv_cache_events\":true}'"
command:
- /bin/sh
- -c
# NOTE(review): same docker.io/lambda108 image as the sidecar above — confirm.
# test_gaie_deployment overrides it with its `--image` value.
image: docker.io/lambda108/dynamo:post-rebase
imagePullPolicy: IfNotPresent
workingDir: /workspace/examples/backends/vllm
replicas: 1
# One GPU requested and capped per replica.
resources:
limits:
gpu: "1"
requests:
gpu: "1"
# Decode worker: tagged `subComponentType: decode`, the value accepted by the
# Epp `decode-filter`.
VllmDecodeWorker:
componentType: worker
subComponentType: decode
envFromSecret: hf-token-secret
sharedMemory:
size: 2Gi
# Sidecar started with `-m dynamo.frontend --router-mode direct`.
frontendSidecar:
# NOTE(review): this image lives under the docker.io/lambda108 namespace,
# unlike the nvcr.io/nvidia/ai-dynamo image used by the Epp service — confirm
# it is intended for a published example. test_gaie_deployment overrides it
# with its `--frontend-image` value.
image: docker.io/lambda108/dynamo:post-rebase
args:
- -m
- dynamo.frontend
- --router-mode
- direct
envFromSecret: hf-token-secret
extraPodSpec:
# Soft preference (weight 100, not a requirement) for a node that already
# hosts a pod labelled nvidia.com/dynamo-component-type=worker.
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "Qwen/Qwen3-0.6B"
- name: MODEL_PATH
value: "Qwen/Qwen3-0.6B"
# A single string run by `/bin/sh -c` (see `command`), so the shell expands
# $MODEL_PATH and $SERVED_MODEL_NAME from the env entries above.
# NOTE(review): unlike VllmPrefillWorker, no `--disaggregation-mode` flag is
# passed here; presumably decode is the default mode — confirm.
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 1 --data-parallel-size 1 --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --enable-prefix-caching --block-size 16 --kv-events-config '{\"enable_kv_cache_events\":true}'"
command:
- /bin/sh
- -c
# NOTE(review): same docker.io/lambda108 image as the sidecar above — confirm.
# test_gaie_deployment overrides it with its `--image` value.
image: docker.io/lambda108/dynamo:post-rebase
imagePullPolicy: IfNotPresent
workingDir: /workspace/examples/backends/vllm
replicas: 1
# One GPU requested and capped per replica.
resources:
limits:
gpu: "1"
requests:
gpu: "1"
\ No newline at end of file
......@@ -18,18 +18,18 @@
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: qwen-agg-route
name: qwen-route
spec:
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: inference-gateway
namespace: my-model # the namespace where your gateway is deployed.
namespace: default # the namespace where your gateway is deployed.
rules:
- backendRefs:
- group: inference.networking.k8s.io
kind: InferencePool
name: qwen-agg-pool
name: qwen-pool
port: 8000
weight: 1
matches:
......
......@@ -254,6 +254,8 @@ markers = [
"k8s: marks tests as requiring Kubernetes",
"fault_tolerance: marks tests as fault tolerance tests",
"deploy: marks tests as deployment tests",
"framework_only: marks standard framework deployment tests (vllm, sglang, trtllm)",
"framework_with_gaie: marks tests for GAIE (Gateway API Inference Extension) deployment",
# Built-in markers
"skip: skip this test",
"skipif: skip if condition is true",
......
......@@ -22,6 +22,8 @@ spec:
value: "128"
- name: DYN_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: DYN_ENFORCE_DISAGG
value: "true"
eppConfig:
config:
plugins:
......@@ -81,6 +83,10 @@ spec:
- direct
envFromSecret: hf-token-secret
extraPodSpec:
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
......@@ -132,6 +138,10 @@ spec:
- direct
envFromSecret: hf-token-secret
extraPodSpec:
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
......
......@@ -40,6 +40,12 @@ def pytest_addoption(parser: pytest.Parser) -> None:
help="Deployment profile to test (e.g., agg, disagg, disagg_router). "
"If not specified, runs all profiles for the selected framework.",
)
parser.addoption(
"--frontend-image",
type=str,
default=None,
help="Frontend container image (used by GAIE deploy tests).",
)
@dataclass(frozen=True)
......
......@@ -9,14 +9,23 @@ to chat completion requests correctly.
"""
import logging
import os
import subprocess
import time
from typing import Any, Dict
import kr8s
import pytest
import requests
import yaml
from tests.deploy.conftest import DeploymentTarget
from tests.utils.client import send_request, wait_for_model_availability
from tests.utils.managed_deployment import DeploymentSpec, ManagedDeployment
from tests.utils.managed_deployment import (
DeploymentSpec,
ManagedDeployment,
_get_workspace_dir,
)
logger = logging.getLogger(__name__)
......@@ -38,6 +47,7 @@ DEFAULT_REQUEST_TIMEOUT = 120
# Minimum response content length to validate that the model is generating meaningful output.
# This matches the validation threshold from the original shell-based deployment tests.
MIN_RESPONSE_CONTENT_LENGTH = 100
GAIE_MODEL_NAME = "Qwen/Qwen3-0.6B"
def validate_chat_response(
......@@ -100,6 +110,7 @@ def validate_chat_response(
return data
@pytest.mark.framework_only
@pytest.mark.k8s
@pytest.mark.deploy
@pytest.mark.post_merge
......@@ -213,3 +224,203 @@ async def test_deployment(
f"Deployment test PASSED for {deployment_target.test_id} "
f"(source: {deployment_target.source}, model: {model}, namespace: {namespace})"
)
def _log_kubectl_get(summary: str, get_args: list, failure_summary: str = "") -> None:
    """Log the output of one diagnostic ``kubectl get`` call.

    Purely informational: the command's stdout is logged at INFO level and,
    when ``failure_summary`` is given and the command exits non-zero, its
    stderr is logged at ERROR level. A failing command never fails the test.

    Args:
        summary: Prefix for the INFO line (``"<summary>: <stdout>"``).
        get_args: Arguments appended after ``kubectl get``.
        failure_summary: Prefix for the ERROR line; empty disables it.
    """
    proc = subprocess.run(
        ["kubectl", "get", *get_args],
        capture_output=True,
        text=True,
    )
    logger.info(f"{summary}: {proc.stdout.strip()}")
    if failure_summary and proc.returncode != 0:
        logger.error(f"{failure_summary}: {proc.stderr}")


# GAIE (Gateway API Inference Extension) deployment test
@pytest.mark.framework_with_gaie
@pytest.mark.k8s
@pytest.mark.deploy
@pytest.mark.post_merge
@pytest.mark.e2e
@pytest.mark.timeout(900)
async def test_gaie_deployment(
    image: str,
    namespace: str,
    skip_service_restart: bool,
    request,
) -> None:
    """Test GAIE disaggregated deployment with vLLM workers.

    Applies the GAIE DynamoGraphDeployment (with CI-built images) and the
    companion HTTPRoute, then verifies inference works end-to-end through
    the full Gateway path.

    Args:
        image: Worker image (asserted non-empty; set on both vLLM workers).
        namespace: Namespace the HTTPRoute and deployment are created in.
        skip_service_restart: Forwarded unchanged to ``ManagedDeployment``.
        request: Pytest request object; provides ``--frontend-image`` and the
            node name used as the log directory.

    Raises:
        AssertionError: If a required option or manifest is missing, the
            HTTPRoute cannot be applied, no EPP pod or gateway service is
            found, or the model does not become available.
    """
    frontend_image = request.config.getoption("--frontend-image")
    worker_image = image

    assert frontend_image, "--frontend-image is required for GAIE deploy test"
    assert worker_image, "--image is required for GAIE deploy test"
    assert namespace, "--namespace is required for GAIE deploy test"

    workspace = _get_workspace_dir()
    gaie_dir = os.path.join(workspace, "examples", "backends", "vllm", "deploy", "gaie")
    disagg_path = os.path.join(gaie_dir, "disagg.yaml")
    httproute_path = os.path.join(gaie_dir, "http-route.yaml")

    assert os.path.exists(disagg_path), f"disagg.yaml not found: {disagg_path}"
    assert os.path.exists(
        httproute_path
    ), f"http-route.yaml not found: {httproute_path}"

    deployment_spec = DeploymentSpec(disagg_path)
    deployment_spec.namespace = namespace

    logger.info(f"Frontend image: {frontend_image}")
    logger.info(f"Worker image: {worker_image}")

    # The Epp service and every worker's frontend sidecar run the frontend
    # image; the workers' main containers run the framework runtime image.
    deployment_spec.set_image(frontend_image, service_name="Epp")
    for worker in ("VllmPrefillWorker", "VllmDecodeWorker"):
        deployment_spec.set_image(worker_image, service_name=worker)
        deployment_spec.set_frontend_sidecar_image(frontend_image, service_name=worker)

    # Per-namespace hostname: requests below carry it as the Host header so
    # the gateway can match them to this route.
    route_hostname = f"{namespace}.example.com"
    logger.info(f"HTTPRoute hostname: {route_hostname}")

    with open(httproute_path) as f:
        httproute_spec = yaml.safe_load(f)
    httproute_spec["spec"]["hostnames"] = [route_hostname]
    httproute_yaml = yaml.safe_dump(httproute_spec)

    logger.info("Applying GAIE HTTPRoute...")
    result = subprocess.run(
        ["kubectl", "apply", "-n", namespace, "-f", "-"],
        input=httproute_yaml,
        capture_output=True,
        text=True,
    )
    logger.info(f"HTTPRoute apply stdout: {result.stdout}")
    if result.stderr:
        logger.warning(f"HTTPRoute apply stderr: {result.stderr}")
    assert result.returncode == 0, f"Failed to apply HTTPRoute: {result.stderr}"

    # Debug: verify namespace state before creating DGD
    logger.info(f"Namespace: {namespace}")
    _log_kubectl_get("Namespace check", ["namespace", namespace], "Namespace not found")

    # Debug: check if operator CRD is registered
    _log_kubectl_get(
        "CRD check",
        ["crd", "dynamographdeployments.nvidia.com"],
        "CRD not found",
    )

    # Debug: check operator pod status
    _log_kubectl_get(
        "Operator pods",
        ["pods", "-n", namespace, "-l", "app.kubernetes.io/name=dynamo-operator"],
    )

    # Debug: log the full deployment spec being submitted
    logger.info(f"DGD name: {deployment_spec.name}")
    logger.info(f"DGD namespace: {deployment_spec.namespace}")
    logger.info(f"DGD services: {[s.name for s in deployment_spec.services]}")

    async with ManagedDeployment(
        log_dir=request.node.name,
        deployment_spec=deployment_spec,
        namespace=namespace,
        skip_service_restart=skip_service_restart,
        frontend_service_name="Epp",
    ) as deployment:
        # Debug: check what DGDs and pods exist after creation
        _log_kubectl_get(
            "DGDs after creation", ["dynamographdeployments", "-n", namespace]
        )
        _log_kubectl_get(
            "Pods after creation", ["pods", "-n", namespace, "-o", "wide"]
        )

        epp_pods = deployment.get_pods(["Epp"])
        epp_pod_list = epp_pods.get("Epp", [])
        assert len(epp_pod_list) > 0, "No EPP pods found for GAIE deployment"
        logger.info(f"Found EPP pod: {epp_pod_list[0].name}")

        gateway_svcs = list(
            kr8s.get("services", "inference-gateway", namespace=namespace)
        )
        assert (
            len(gateway_svcs) > 0
        ), f"inference-gateway service not found in namespace {namespace}"

        gateway_pf = gateway_svcs[0].portforward(remote_port=80, local_port=0)
        gateway_pf.start()
        # Enter the try block immediately after start() so the port-forward is
        # stopped even if the settle delay below is interrupted.
        try:
            time.sleep(2)

            gateway_url = f"http://localhost:{gateway_pf.local_port}"
            logger.info(f"Gateway port-forward established: {gateway_url}")

            endpoint = deployment_spec.endpoint
            headers = {"Host": route_hostname}
            logger.info(f"Using Host header: {route_hostname}")

            model_ready = wait_for_model_availability(
                url=gateway_url,
                endpoint=endpoint,
                model=GAIE_MODEL_NAME,
                logger=logger,
                max_attempts=30,
                headers=headers,
            )
            assert model_ready, (
                f"Model '{GAIE_MODEL_NAME}' did not become available "
                f"within the timeout period"
            )

            url = f"{gateway_url}{endpoint}"
            payload = {
                "model": GAIE_MODEL_NAME,
                "messages": [{"role": "user", "content": TEST_PROMPT}],
                "max_tokens": DEFAULT_MAX_TOKENS,
                "temperature": DEFAULT_TEMPERATURE,
                "stream": False,
            }

            logger.info(f"Sending inference request to {url}")
            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=DEFAULT_REQUEST_TIMEOUT,
            )

            validate_chat_response(
                response=response,
                expected_model=GAIE_MODEL_NAME,
                min_content_length=MIN_RESPONSE_CONTENT_LENGTH,
            )

            data = response.json()
            content = data["choices"][0]["message"]["content"]
            logger.info(
                f"GAIE deployment test PASSED | "
                f"model={data['model']}, status={response.status_code}, "
                f"response_length={len(content)} chars\n"
                f"Model response: {content}"
            )
        finally:
            gateway_pf.stop()
......@@ -159,6 +159,7 @@ def wait_for_model_availability(
logger: logging.Logger,
max_attempts: int = 15,
attempt_timeouts: list[float] | None = None,
headers: dict[str, str] | None = None,
) -> bool:
"""
Wait for model to be available by sending test requests.
......@@ -197,7 +198,9 @@ def wait_for_model_availability(
logger.debug(
f"Testing model availability at {test_url} (attempt {attempt+1}/{max_attempts}, timeout={timeout_val}s)"
)
response = requests.post(test_url, json=test_payload, timeout=timeout_val)
response = requests.post(
test_url, json=test_payload, timeout=timeout_val, headers=headers
)
if response.status_code == 200:
logger.info(f"Model '{model}' is available and responding")
......
......@@ -66,6 +66,20 @@ class ServiceSpec:
self._spec["extraPodSpec"]["mainContainer"] = {}
self._spec["extraPodSpec"]["mainContainer"]["image"] = value
@property
def frontend_sidecar_image(self) -> Optional[str]:
    """Container image for the frontendSidecar (if present).

    Returns:
        The ``image`` value of the ``frontendSidecar`` section, or ``None``
        when the section is missing, null, or has no ``image`` key.
    """
    # `.get` plus the None check also covers a bare `frontendSidecar:` key,
    # which loads as None and cannot be subscripted.
    sidecar = self._spec.get("frontendSidecar")
    if sidecar is None:
        return None
    return sidecar.get("image")

@frontend_sidecar_image.setter
def frontend_sidecar_image(self, value: str):
    """Set the frontendSidecar image, creating the section when needed.

    Args:
        value: Image reference stored under ``frontendSidecar.image``.
    """
    # Create the section when it is absent or null; other keys of an
    # existing section (args, envFromSecret, ...) are left untouched.
    if self._spec.get("frontendSidecar") is None:
        self._spec["frontendSidecar"] = {}
    self._spec["frontendSidecar"]["image"] = value
@property
def envs(self) -> list[dict[str, str]]:
"""Environment variables for the service"""
......@@ -230,6 +244,16 @@ class DeploymentSpec:
for service in services:
service.image = image
def set_frontend_sidecar_image(
    self, image: str, service_name: Optional[str] = None
):
    """Assign ``image`` as the frontend-sidecar image of one service or of all.

    Args:
        image: Image reference written to each target's
            ``frontend_sidecar_image``.
        service_name: Service looked up via ``self[service_name]``. When
            ``None``, every entry of ``self.services`` is updated.
    """
    targets = self.services if service_name is None else [self[service_name]]
    for target in targets:
        target.frontend_sidecar_image = image
def set_tensor_parallel(self, tp_size: int, service_names: Optional[list] = None):
"""Scale deployment for different tensor parallel configurations
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment