Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2176c431
"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "63fe3a700fd23aa7b2cb43cbd1cb7af960832603"
Unverified
Commit
2176c431
authored
Feb 03, 2026
by
Thomas Montfort
Committed by
GitHub
Feb 04, 2026
Browse files
feat(planner): Derive prefill/decode GPU counts from DGD (#5919)
parent
8daacbd7
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
583 additions
and
4 deletions
+583
-4
components/src/dynamo/planner/defaults.py
components/src/dynamo/planner/defaults.py
+35
-0
components/src/dynamo/planner/kubernetes_connector.py
components/src/dynamo/planner/kubernetes_connector.py
+51
-0
components/src/dynamo/planner/utils/dryrun.py
components/src/dynamo/planner/utils/dryrun.py
+6
-0
components/src/dynamo/planner/utils/planner_argparse.py
components/src/dynamo/planner/utils/planner_argparse.py
+6
-4
components/src/dynamo/planner/utils/planner_core.py
components/src/dynamo/planner/utils/planner_core.py
+65
-0
tests/planner/unit/kubernetes_connector.py
tests/planner/unit/kubernetes_connector.py
+207
-0
tests/planner/unit/test_sla_planner_scaling.py
tests/planner/unit/test_sla_planner_scaling.py
+213
-0
No files found.
components/src/dynamo/planner/defaults.py
View file @
2176c431
...
@@ -183,6 +183,41 @@ class Service(BaseModel):
...
@@ -183,6 +183,41 @@ class Service(BaseModel):
return
None
return
None
def
get_gpu_count
(
self
)
->
int
:
"""Get the GPU count from the service's resource specification.
GPU count is read from spec.services.[ServiceName].resources.limits.gpu,
falling back to requests.gpu if limits is not specified.
Returns:
The number of GPUs configured for this service
Raises:
ValueError: If GPU count is not specified or invalid
"""
resources
=
self
.
service
.
get
(
"resources"
,
{})
limits
=
resources
.
get
(
"limits"
,
{})
requests
=
resources
.
get
(
"requests"
,
{})
# Prefer limits, fall back to requests. For GPUs, Kubernetes device plugins
# typically treat requests and limits as equivalent since GPUs are
# non-compressible and allocated exclusively (no fractional sharing).
gpu_str
=
limits
.
get
(
"gpu"
)
or
requests
.
get
(
"gpu"
)
if
gpu_str
is
None
:
raise
ValueError
(
f
"No GPU count specified for service '
{
self
.
name
}
'. "
f
"Please set resources.limits.gpu or resources.requests.gpu in the DGD."
)
try
:
return
int
(
gpu_str
)
except
(
ValueError
,
TypeError
):
raise
ValueError
(
f
"Invalid GPU count '
{
gpu_str
}
' for service '
{
self
.
name
}
'. "
f
"GPU count must be an integer."
)
# TODO: still supporting framework component names for backwards compatibility
# TODO: still supporting framework component names for backwards compatibility
# Should be deprecated in favor of service subComponentType
# Should be deprecated in favor of service subComponentType
...
...
components/src/dynamo/planner/kubernetes_connector.py
View file @
2176c431
...
@@ -227,6 +227,57 @@ class KubernetesConnector(PlannerConnector):
...
@@ -227,6 +227,57 @@ class KubernetesConnector(PlannerConnector):
return
model_name
return
model_name
def
get_gpu_counts
(
self
,
deployment
:
Optional
[
dict
]
=
None
,
require_prefill
:
bool
=
True
,
require_decode
:
bool
=
True
,
)
->
tuple
[
int
,
int
]:
"""Get the GPU counts for prefill and decode services from the deployment.
Args:
deployment: Optional deployment dict, fetched if not provided
require_prefill: Whether to require prefill service
require_decode: Whether to require decode service
Returns:
Tuple of (prefill_gpu_count, decode_gpu_count)
Raises:
DeploymentValidationError: If GPU counts cannot be determined from DGD
"""
if
deployment
is
None
:
deployment
=
self
.
kube_api
.
get_graph_deployment
(
self
.
graph_deployment_name
)
prefill_gpu_count
=
0
decode_gpu_count
=
0
errors
=
[]
if
require_prefill
:
try
:
prefill_service
=
get_service_from_sub_component_type_or_name
(
deployment
,
SubComponentType
.
PREFILL
,
)
prefill_gpu_count
=
prefill_service
.
get_gpu_count
()
except
(
PlannerError
,
ValueError
)
as
e
:
errors
.
append
(
f
"Failed to get prefill GPU count:
{
e
}
"
)
if
require_decode
:
try
:
decode_service
=
get_service_from_sub_component_type_or_name
(
deployment
,
SubComponentType
.
DECODE
,
)
decode_gpu_count
=
decode_service
.
get_gpu_count
()
except
(
PlannerError
,
ValueError
)
as
e
:
errors
.
append
(
f
"Failed to get decode GPU count:
{
e
}
"
)
if
errors
:
raise
DeploymentValidationError
(
errors
)
return
prefill_gpu_count
,
decode_gpu_count
async
def
wait_for_deployment_ready
(
self
):
async
def
wait_for_deployment_ready
(
self
):
"""Wait for the deployment to be ready"""
"""Wait for the deployment to be ready"""
await
self
.
kube_api
.
wait_for_graph_deployment_ready
(
await
self
.
kube_api
.
wait_for_graph_deployment_ready
(
...
...
components/src/dynamo/planner/utils/dryrun.py
View file @
2176c431
...
@@ -16,6 +16,12 @@ from dynamo.planner.utils.trace_data_extractor import extract_metrics_from_moonc
...
@@ -16,6 +16,12 @@ from dynamo.planner.utils.trace_data_extractor import extract_metrics_from_moonc
def
run_sla_planner_dryrun
(
args
:
argparse
.
Namespace
)
->
None
:
def
run_sla_planner_dryrun
(
args
:
argparse
.
Namespace
)
->
None
:
# Dryrun mode: use defaults if GPU counts not provided (no DGD available)
if
args
.
prefill_engine_num_gpu
is
None
:
args
.
prefill_engine_num_gpu
=
1
if
args
.
decode_engine_num_gpu
is
None
:
args
.
decode_engine_num_gpu
=
1
warmup_metrics
=
None
warmup_metrics
=
None
if
getattr
(
args
,
"load_predictor_warmup_trace"
,
None
):
if
getattr
(
args
,
"load_predictor_warmup_trace"
,
None
):
warmup_metrics
=
extract_metrics_from_mooncake
(
warmup_metrics
=
extract_metrics_from_mooncake
(
...
...
components/src/dynamo/planner/utils/planner_argparse.py
View file @
2176c431
...
@@ -78,14 +78,16 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
...
@@ -78,14 +78,16 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
parser
.
add_argument
(
parser
.
add_argument
(
"--decode-engine-num-gpu"
,
"--decode-engine-num-gpu"
,
type
=
int
,
type
=
int
,
default
=
SLAPlannerDefaults
.
decode_engine_num_gpu
,
default
=
None
,
help
=
"Number of GPUs for decode engine"
,
help
=
"Number of GPUs per decode engine. In Kubernetes mode, this is auto-detected "
"from DGD resources but can be overridden (e.g., for mockers without GPU resources)."
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--prefill-engine-num-gpu"
,
"--prefill-engine-num-gpu"
,
type
=
int
,
type
=
int
,
default
=
SLAPlannerDefaults
.
prefill_engine_num_gpu
,
default
=
None
,
help
=
"Number of GPUs for prefill engine"
,
help
=
"Number of GPUs per prefill engine. In Kubernetes mode, this is auto-detected "
"from DGD resources but can be overridden (e.g., for mockers without GPU resources)."
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--profile-results-dir"
,
"--profile-results-dir"
,
...
...
components/src/dynamo/planner/utils/planner_core.py
View file @
2176c431
...
@@ -18,6 +18,7 @@ from dynamo.planner import (
...
@@ -18,6 +18,7 @@ from dynamo.planner import (
VirtualConnector
,
VirtualConnector
,
)
)
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
from
dynamo.planner.utils.exceptions
import
DeploymentValidationError
from
dynamo.planner.utils.load_predictor
import
LOAD_PREDICTORS
from
dynamo.planner.utils.load_predictor
import
LOAD_PREDICTORS
from
dynamo.planner.utils.perf_interpolation
import
(
from
dynamo.planner.utils.perf_interpolation
import
(
DecodeInterpolator
,
DecodeInterpolator
,
...
@@ -206,6 +207,54 @@ def _apply_component_gpu_budget(
...
@@ -206,6 +207,54 @@ def _apply_component_gpu_budget(
return
next_num
return
next_num
def
_initialize_gpu_counts
(
args
:
argparse
.
Namespace
,
connector
,
require_prefill
:
bool
,
require_decode
:
bool
,
)
->
None
:
"""Initialize GPU counts from DGD (Kubernetes) or CLI args (virtual).
In Kubernetes mode: reads from DGD, falls back to CLI flags if not found
(useful for mockers that don't specify GPU resources).
In virtual mode: requires CLI flags, errors if not provided.
Raises:
DeploymentValidationError: If GPU counts cannot be determined
"""
# Try to read from DGD in Kubernetes mode
if
hasattr
(
connector
,
"get_gpu_counts"
):
try
:
prefill_gpu
,
decode_gpu
=
connector
.
get_gpu_counts
(
require_prefill
=
require_prefill
,
require_decode
=
require_decode
,
)
args
.
prefill_engine_num_gpu
=
prefill_gpu
args
.
decode_engine_num_gpu
=
decode_gpu
logger
.
info
(
f
"Detected GPU counts from DGD: prefill=
{
prefill_gpu
}
, decode=
{
decode_gpu
}
"
)
return
except
Exception
as
e
:
# Fall back to CLI flags (e.g., for mockers without GPU resources in DGD)
logger
.
warning
(
f
"Could not read GPU counts from DGD (
{
e
}
), falling back to CLI flags"
)
# Use CLI flags (virtual mode, or K8s fallback when DGD lacks GPU resources)
errors
=
[]
if
require_prefill
and
args
.
prefill_engine_num_gpu
is
None
:
errors
.
append
(
"Missing --prefill-engine-num-gpu flag"
)
if
require_decode
and
args
.
decode_engine_num_gpu
is
None
:
errors
.
append
(
"Missing --decode-engine-num-gpu flag"
)
if
errors
:
raise
DeploymentValidationError
(
errors
)
logger
.
info
(
f
"Using GPU counts from CLI: prefill=
{
args
.
prefill_engine_num_gpu
}
, "
f
"decode=
{
args
.
decode_engine_num_gpu
}
"
)
class
BasePlanner
:
class
BasePlanner
:
component_type
:
SubComponentType
component_type
:
SubComponentType
...
@@ -636,6 +685,14 @@ class BasePlanner:
...
@@ -636,6 +685,14 @@ class BasePlanner:
)
)
logger
.
info
(
"Successfully validated the deployment"
)
logger
.
info
(
"Successfully validated the deployment"
)
# Initialize GPU counts
_initialize_gpu_counts
(
self
.
args
,
self
.
connector
,
require_prefill
=
require_prefill
,
require_decode
=
require_decode
,
)
await
self
.
connector
.
wait_for_deployment_ready
()
await
self
.
connector
.
wait_for_deployment_ready
()
model_name
=
await
self
.
_get_model_name
(
model_name
=
await
self
.
_get_model_name
(
...
@@ -808,6 +865,14 @@ class DisaggPlanner:
...
@@ -808,6 +865,14 @@ class DisaggPlanner:
)
)
logger
.
info
(
"Successfully validated the deployment"
)
logger
.
info
(
"Successfully validated the deployment"
)
# Initialize GPU counts
_initialize_gpu_counts
(
self
.
args
,
self
.
prefill_planner
.
connector
,
require_prefill
=
True
,
require_decode
=
True
,
)
await
self
.
prefill_planner
.
connector
.
wait_for_deployment_ready
()
await
self
.
prefill_planner
.
connector
.
wait_for_deployment_ready
()
model_name
=
await
self
.
prefill_planner
.
_get_model_name
(
model_name
=
await
self
.
prefill_planner
.
_get_model_name
(
...
...
tests/planner/unit/kubernetes_connector.py
View file @
2176c431
...
@@ -19,6 +19,7 @@ from unittest.mock import AsyncMock, Mock, call, patch
...
@@ -19,6 +19,7 @@ from unittest.mock import AsyncMock, Mock, call, patch
import
pytest
import
pytest
from
dynamo.planner.defaults
import
(
from
dynamo.planner.defaults
import
(
Service
,
SubComponentType
,
SubComponentType
,
get_service_from_sub_component_type_or_name
,
get_service_from_sub_component_type_or_name
,
)
)
...
@@ -681,3 +682,209 @@ def test_get_model_name_agree_returns_model_name(kubernetes_connector, mock_kube
...
@@ -681,3 +682,209 @@ def test_get_model_name_agree_returns_model_name(kubernetes_connector, mock_kube
# Assert
# Assert
assert
result
==
"agreed-model"
assert
result
==
"agreed-model"
# Tests for Service.get_gpu_count()
def
test_service_get_gpu_count_valid
():
"""Test that get_gpu_count returns correct GPU count from resources.limits.gpu"""
service
=
Service
(
name
=
"test-service"
,
service
=
{
"replicas"
:
1
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"4"
}},
},
)
assert
service
.
get_gpu_count
()
==
4
def
test_service_get_gpu_count_from_requests_fallback
():
"""Test that get_gpu_count falls back to requests.gpu when limits.gpu is missing"""
service
=
Service
(
name
=
"test-service"
,
service
=
{
"replicas"
:
1
,
"resources"
:
{
"requests"
:
{
"gpu"
:
"2"
}},
},
)
assert
service
.
get_gpu_count
()
==
2
def
test_service_get_gpu_count_limits_preferred_over_requests
():
"""Test that limits.gpu is preferred over requests.gpu when both are present"""
service
=
Service
(
name
=
"test-service"
,
service
=
{
"replicas"
:
1
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"4"
},
"requests"
:
{
"gpu"
:
"2"
},
},
},
)
assert
service
.
get_gpu_count
()
==
4
def
test_service_get_gpu_count_integer_value
():
"""Test that get_gpu_count works with integer GPU values"""
service
=
Service
(
name
=
"test-service"
,
service
=
{
"replicas"
:
1
,
"resources"
:
{
"limits"
:
{
"gpu"
:
2
}},
},
)
assert
service
.
get_gpu_count
()
==
2
def
test_service_get_gpu_count_missing_raises_error
():
"""Test that get_gpu_count raises ValueError when GPU count is missing"""
service
=
Service
(
name
=
"test-service"
,
service
=
{
"replicas"
:
1
},
)
with
pytest
.
raises
(
ValueError
)
as
exc_info
:
service
.
get_gpu_count
()
assert
"No GPU count specified"
in
str
(
exc_info
.
value
)
assert
"test-service"
in
str
(
exc_info
.
value
)
def
test_service_get_gpu_count_invalid_raises_error
():
"""Test that get_gpu_count raises ValueError for invalid GPU count"""
service
=
Service
(
name
=
"test-service"
,
service
=
{
"replicas"
:
1
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"invalid"
}},
},
)
with
pytest
.
raises
(
ValueError
)
as
exc_info
:
service
.
get_gpu_count
()
assert
"Invalid GPU count"
in
str
(
exc_info
.
value
)
# Tests for KubernetesConnector.get_gpu_counts()
def
test_get_gpu_counts_both_services
(
kubernetes_connector
,
mock_kube_api
):
"""Test get_gpu_counts returns correct counts for both prefill and decode"""
mock_deployment
=
{
"metadata"
:
{
"name"
:
"test-graph"
},
"spec"
:
{
"services"
:
{
"prefill-worker"
:
{
"replicas"
:
1
,
"subComponentType"
:
"prefill"
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"2"
}},
},
"decode-worker"
:
{
"replicas"
:
1
,
"subComponentType"
:
"decode"
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"4"
}},
},
}
},
}
mock_kube_api
.
get_graph_deployment
.
return_value
=
mock_deployment
prefill_gpu
,
decode_gpu
=
kubernetes_connector
.
get_gpu_counts
()
assert
prefill_gpu
==
2
assert
decode_gpu
==
4
def
test_get_gpu_counts_prefill_only
(
kubernetes_connector
,
mock_kube_api
):
"""Test get_gpu_counts with require_decode=False"""
mock_deployment
=
{
"metadata"
:
{
"name"
:
"test-graph"
},
"spec"
:
{
"services"
:
{
"prefill-worker"
:
{
"replicas"
:
1
,
"subComponentType"
:
"prefill"
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"2"
}},
},
}
},
}
mock_kube_api
.
get_graph_deployment
.
return_value
=
mock_deployment
prefill_gpu
,
decode_gpu
=
kubernetes_connector
.
get_gpu_counts
(
require_prefill
=
True
,
require_decode
=
False
)
assert
prefill_gpu
==
2
assert
decode_gpu
==
0
def
test_get_gpu_counts_decode_only
(
kubernetes_connector
,
mock_kube_api
):
"""Test get_gpu_counts with require_prefill=False"""
mock_deployment
=
{
"metadata"
:
{
"name"
:
"test-graph"
},
"spec"
:
{
"services"
:
{
"decode-worker"
:
{
"replicas"
:
1
,
"subComponentType"
:
"decode"
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"4"
}},
},
}
},
}
mock_kube_api
.
get_graph_deployment
.
return_value
=
mock_deployment
prefill_gpu
,
decode_gpu
=
kubernetes_connector
.
get_gpu_counts
(
require_prefill
=
False
,
require_decode
=
True
)
assert
prefill_gpu
==
0
assert
decode_gpu
==
4
def
test_get_gpu_counts_missing_gpu_raises_error
(
kubernetes_connector
,
mock_kube_api
):
"""Test get_gpu_counts raises DeploymentValidationError when GPU count missing"""
mock_deployment
=
{
"metadata"
:
{
"name"
:
"test-graph"
},
"spec"
:
{
"services"
:
{
"prefill-worker"
:
{
"replicas"
:
1
,
"subComponentType"
:
"prefill"
,
# No resources.limits.gpu
},
"decode-worker"
:
{
"replicas"
:
1
,
"subComponentType"
:
"decode"
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"4"
}},
},
}
},
}
mock_kube_api
.
get_graph_deployment
.
return_value
=
mock_deployment
with
pytest
.
raises
(
DeploymentValidationError
)
as
exc_info
:
kubernetes_connector
.
get_gpu_counts
()
assert
"prefill GPU count"
in
str
(
exc_info
.
value
)
def
test_get_gpu_counts_service_not_found_raises_error
(
kubernetes_connector
,
mock_kube_api
):
"""Test get_gpu_counts raises DeploymentValidationError when service not found"""
mock_deployment
=
{
"metadata"
:
{
"name"
:
"test-graph"
},
"spec"
:
{
"services"
:
{
"prefill-worker"
:
{
"replicas"
:
1
,
"subComponentType"
:
"prefill"
,
"resources"
:
{
"limits"
:
{
"gpu"
:
"2"
}},
},
# No decode service
}
},
}
mock_kube_api
.
get_graph_deployment
.
return_value
=
mock_deployment
with
pytest
.
raises
(
DeploymentValidationError
)
as
exc_info
:
kubernetes_connector
.
get_gpu_counts
()
assert
"decode GPU count"
in
str
(
exc_info
.
value
)
tests/planner/unit/test_sla_planner_scaling.py
View file @
2176c431
...
@@ -9,10 +9,12 @@ from unittest.mock import Mock, patch
...
@@ -9,10 +9,12 @@ from unittest.mock import Mock, patch
import
pytest
import
pytest
from
dynamo.planner.utils.exceptions
import
DeploymentValidationError
from
dynamo.planner.utils.planner_core
import
(
from
dynamo.planner.utils.planner_core
import
(
DecodePlanner
,
DecodePlanner
,
PlannerSharedState
,
PlannerSharedState
,
PrefillPlanner
,
PrefillPlanner
,
_initialize_gpu_counts
,
)
)
pytestmark
=
[
pytestmark
=
[
...
@@ -201,3 +203,214 @@ def test_disagg_scale_down():
...
@@ -201,3 +203,214 @@ def test_disagg_scale_down():
assert
low_d
==
_expected_decode
(
args
,
decode_planner
,
samples
[
1
])
assert
low_d
==
_expected_decode
(
args
,
decode_planner
,
samples
[
1
])
assert
low_p
<
high_p
assert
low_p
<
high_p
assert
low_d
<
high_d
assert
low_d
<
high_d
# Tests for _initialize_gpu_counts
class
TestInitializeGpuCounts
:
def
test_kubernetes_mode_reads_from_dgd
(
self
):
"""Test that GPU counts are read from DGD in Kubernetes mode"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
None
args
.
decode_engine_num_gpu
=
None
connector
=
Mock
()
connector
.
get_gpu_counts
=
Mock
(
return_value
=
(
2
,
4
))
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
assert
args
.
prefill_engine_num_gpu
==
2
assert
args
.
decode_engine_num_gpu
==
4
connector
.
get_gpu_counts
.
assert_called_once_with
(
require_prefill
=
True
,
require_decode
=
True
)
def
test_kubernetes_mode_prefill_only
(
self
):
"""Test GPU count initialization for prefill-only mode"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
None
args
.
decode_engine_num_gpu
=
None
connector
=
Mock
()
connector
.
get_gpu_counts
=
Mock
(
return_value
=
(
2
,
0
))
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
False
)
assert
args
.
prefill_engine_num_gpu
==
2
assert
args
.
decode_engine_num_gpu
==
0
connector
.
get_gpu_counts
.
assert_called_once_with
(
require_prefill
=
True
,
require_decode
=
False
)
def
test_virtual_mode_uses_cli_args
(
self
):
"""Test that GPU counts come from CLI args in virtual mode"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
2
args
.
decode_engine_num_gpu
=
4
# Virtual connector doesn't have get_gpu_counts method
connector
=
Mock
(
spec
=
[])
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
# Values should remain unchanged
assert
args
.
prefill_engine_num_gpu
==
2
assert
args
.
decode_engine_num_gpu
==
4
def
test_virtual_mode_missing_prefill_raises_error
(
self
):
"""Test that missing prefill GPU flag raises error in virtual mode"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
None
args
.
decode_engine_num_gpu
=
4
connector
=
Mock
(
spec
=
[])
with
pytest
.
raises
(
DeploymentValidationError
)
as
exc_info
:
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
assert
"prefill-engine-num-gpu"
in
str
(
exc_info
.
value
)
def
test_virtual_mode_missing_decode_raises_error
(
self
):
"""Test that missing decode GPU flag raises error in virtual mode"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
2
args
.
decode_engine_num_gpu
=
None
connector
=
Mock
(
spec
=
[])
with
pytest
.
raises
(
DeploymentValidationError
)
as
exc_info
:
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
assert
"decode-engine-num-gpu"
in
str
(
exc_info
.
value
)
def
test_virtual_mode_missing_both_raises_error_with_both_messages
(
self
):
"""Test that missing both GPU flags shows both error messages"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
None
args
.
decode_engine_num_gpu
=
None
connector
=
Mock
(
spec
=
[])
with
pytest
.
raises
(
DeploymentValidationError
)
as
exc_info
:
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
assert
len
(
exc_info
.
value
.
errors
)
==
2
def
test_virtual_mode_decode_only_no_prefill_error
(
self
):
"""Test decode-only mode doesn't require prefill GPU flag"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
None
args
.
decode_engine_num_gpu
=
4
connector
=
Mock
(
spec
=
[])
# Should not raise - prefill not required
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
False
,
require_decode
=
True
)
assert
args
.
decode_engine_num_gpu
==
4
def
test_kubernetes_mode_fallback_to_cli_on_dgd_error
(
self
):
"""Test that K8s mode falls back to CLI flags when DGD parsing fails"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
2
args
.
decode_engine_num_gpu
=
4
connector
=
Mock
()
connector
.
get_gpu_counts
=
Mock
(
side_effect
=
ValueError
(
"No GPU count specified"
)
)
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
# Should use CLI flag values after fallback
assert
args
.
prefill_engine_num_gpu
==
2
assert
args
.
decode_engine_num_gpu
==
4
def
test_kubernetes_mode_fallback_missing_cli_flags_raises_error
(
self
):
"""Test that K8s fallback raises error when CLI flags are also missing"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
None
args
.
decode_engine_num_gpu
=
None
connector
=
Mock
()
connector
.
get_gpu_counts
=
Mock
(
side_effect
=
ValueError
(
"No GPU count specified"
)
)
with
pytest
.
raises
(
DeploymentValidationError
)
as
exc_info
:
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
assert
len
(
exc_info
.
value
.
errors
)
==
2
def
test_kubernetes_mode_fallback_partial_cli_flags
(
self
):
"""Test K8s fallback with only one CLI flag provided"""
args
=
argparse
.
Namespace
()
args
.
prefill_engine_num_gpu
=
2
args
.
decode_engine_num_gpu
=
None
connector
=
Mock
()
connector
.
get_gpu_counts
=
Mock
(
side_effect
=
ValueError
(
"No GPU count specified"
)
)
with
pytest
.
raises
(
DeploymentValidationError
)
as
exc_info
:
_initialize_gpu_counts
(
args
,
connector
,
require_prefill
=
True
,
require_decode
=
True
)
assert
"decode-engine-num-gpu"
in
str
(
exc_info
.
value
)
# Tests for dryrun GPU defaults
class
TestDryrunGpuDefaults
:
def
test_dryrun_defaults_gpu_counts_when_none
(
self
):
"""Test that dryrun sets default GPU counts of 1 when None"""
from
dynamo.planner.utils.dryrun
import
run_sla_planner_dryrun
args
=
_build_args
()
args
.
prefill_engine_num_gpu
=
None
args
.
decode_engine_num_gpu
=
None
args
.
dataset
=
"nonexistent.jsonl"
# Will fail but we check args first
# The function will set defaults before trying to load dataset
try
:
run_sla_planner_dryrun
(
args
)
except
(
FileNotFoundError
,
ValueError
):
pass
# Expected - dataset doesn't exist
assert
args
.
prefill_engine_num_gpu
==
1
assert
args
.
decode_engine_num_gpu
==
1
def
test_dryrun_preserves_cli_gpu_counts
(
self
):
"""Test that dryrun preserves GPU counts provided via CLI"""
from
dynamo.planner.utils.dryrun
import
run_sla_planner_dryrun
args
=
_build_args
()
args
.
prefill_engine_num_gpu
=
2
args
.
decode_engine_num_gpu
=
4
args
.
dataset
=
"nonexistent.jsonl"
try
:
run_sla_planner_dryrun
(
args
)
except
(
FileNotFoundError
,
ValueError
):
pass
assert
args
.
prefill_engine_num_gpu
==
2
assert
args
.
decode_engine_num_gpu
==
4
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment