Unverified commit 611e856d, authored by hhzhang16, committed by GitHub
Browse files

feat: hide optimizationType (#7160)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 012236ee
...@@ -52,11 +52,6 @@ class ProfilingPhase(str, Enum): ...@@ -52,11 +52,6 @@ class ProfilingPhase(str, Enum):
Done = "Done" Done = "Done"
class OptimizationType(str, Enum):
Latency = "latency"
Throughput = "throughput"
class SearchStrategy(str, Enum): class SearchStrategy(str, Enum):
Rapid = "rapid" Rapid = "rapid"
Thorough = "thorough" Thorough = "thorough"
...@@ -101,16 +96,11 @@ class WorkloadSpec(BaseModel): ...@@ -101,16 +96,11 @@ class WorkloadSpec(BaseModel):
class SLASpec(BaseModel): class SLASpec(BaseModel):
"""Service-level agreement targets. """Service-level agreement targets.
Provide exactly one of: Provide one of:
- ``ttft`` + ``itl``: explicit latency targets (default: 2000 ms / 30 ms) - ``ttft`` + ``itl``: explicit latency targets (default: 2000 ms / 30 ms)
- ``e2eLatency``: end-to-end latency target - ``e2eLatency``: end-to-end latency target (mutually exclusive with ttft/itl)"""
- ``optimizationType``: high-level objective without explicit numeric targets"""
optimizationType: Optional[OptimizationType] = Field(
default=None,
description="OptimizationType controls the profiling optimization strategy. Use when explicit SLA targets (ttft+itl or e2eLatency) are not known.",
)
ttft: Optional[float] = Field( ttft: Optional[float] = Field(
default=2000, default=2000,
description="TTFT is the Time To First Token target in milliseconds.", description="TTFT is the Time To First Token target in milliseconds.",
...@@ -125,20 +115,17 @@ class SLASpec(BaseModel): ...@@ -125,20 +115,17 @@ class SLASpec(BaseModel):
@model_validator(mode="after") @model_validator(mode="after")
def _validate_sla_options(self) -> "SLASpec": def _validate_sla_options(self) -> "SLASpec":
"""Ensure at most one SLA mode is active.""" """Ensure e2eLatency and ttft/itl are not both provided."""
has_e2e = self.e2eLatency is not None has_e2e = self.e2eLatency is not None
has_opt = self.optimizationType is not None
ttft_itl_touched = ( ttft_itl_touched = (
"ttft" in self.model_fields_set or "itl" in self.model_fields_set "ttft" in self.model_fields_set or "itl" in self.model_fields_set
) )
has_ttft_itl = (self.ttft is not None and self.itl is not None) and ( has_ttft_itl = (
ttft_itl_touched or (not has_e2e and not has_opt) self.ttft is not None or self.itl is not None
) ) and ttft_itl_touched
options_count = sum([has_ttft_itl, has_e2e, has_opt]) if has_e2e and has_ttft_itl:
if options_count > 1:
raise ValueError( raise ValueError(
"SLA must specify exactly one of: (ttft and itl), e2eLatency, " "SLA must specify either (ttft and itl) or e2eLatency, not both."
"or optimizationType — not multiple."
) )
if (self.ttft is not None) != (self.itl is not None): if (self.ttft is not None) != (self.itl is not None):
raise ValueError("ttft and itl must both be provided together.") raise ValueError("ttft and itl must both be provided together.")
......
...@@ -9078,18 +9078,6 @@ spec: ...@@ -9078,18 +9078,6 @@ spec:
itl: itl:
description: ITL is the Inter-Token Latency target in milliseconds. description: ITL is the Inter-Token Latency target in milliseconds.
type: number type: number
optimizationType:
allOf:
- enum:
- latency
- throughput
- enum:
- latency
- throughput
description: |-
OptimizationType controls the profiling optimization strategy.
Use when explicit SLA targets (ttft+itl or e2eLatency) are not known.
type: string
ttft: ttft:
description: TTFT is the Time To First Token target in milliseconds. description: TTFT is the Time To First Token target in milliseconds.
type: number type: number
......
...@@ -53,10 +53,9 @@ _IMPORT_OVERRIDES: dict[str, tuple[str, str, bool]] = { ...@@ -53,10 +53,9 @@ _IMPORT_OVERRIDES: dict[str, tuple[str, str, bool]] = {
_STRUCT_DOCSTRINGS: dict = { _STRUCT_DOCSTRINGS: dict = {
"SLASpec": ( "SLASpec": (
"Service-level agreement targets.\n\n" "Service-level agreement targets.\n\n"
" Provide exactly one of:\n\n" " Provide one of:\n\n"
" - ``ttft`` + ``itl``: explicit latency targets (default: 2000 ms / 30 ms)\n" " - ``ttft`` + ``itl``: explicit latency targets (default: 2000 ms / 30 ms)\n"
" - ``e2eLatency``: end-to-end latency target\n" " - ``e2eLatency``: end-to-end latency target (mutually exclusive with ttft/itl)"
" - ``optimizationType``: high-level objective without explicit numeric targets"
), ),
} }
...@@ -66,18 +65,15 @@ _STRUCT_EXTRAS: dict = { ...@@ -66,18 +65,15 @@ _STRUCT_EXTRAS: dict = {
"SLASpec": """\ "SLASpec": """\
@model_validator(mode="after") @model_validator(mode="after")
def _validate_sla_options(self) -> "SLASpec": def _validate_sla_options(self) -> "SLASpec":
\"\"\"Ensure at most one SLA mode is active.\"\"\" \"\"\"Ensure e2eLatency and ttft/itl are not both provided.\"\"\"
has_e2e = self.e2eLatency is not None has_e2e = self.e2eLatency is not None
has_opt = self.optimizationType is not None ttft_itl_touched = (
ttft_itl_touched = "ttft" in self.model_fields_set or "itl" in self.model_fields_set "ttft" in self.model_fields_set or "itl" in self.model_fields_set
has_ttft_itl = (self.ttft is not None and self.itl is not None) and (
ttft_itl_touched or (not has_e2e and not has_opt)
) )
options_count = sum([has_ttft_itl, has_e2e, has_opt]) has_ttft_itl = (self.ttft is not None or self.itl is not None) and ttft_itl_touched
if options_count > 1: if has_e2e and has_ttft_itl:
raise ValueError( raise ValueError(
"SLA must specify exactly one of: (ttft and itl), e2eLatency, " "SLA must specify either (ttft and itl) or e2eLatency, not both."
"or optimizationType \u2014 not multiple."
) )
if (self.ttft is not None) != (self.itl is not None): if (self.ttft is not None) != (self.itl is not None):
raise ValueError("ttft and itl must both be provided together.") raise ValueError("ttft and itl must both be provided together.")
......
...@@ -103,7 +103,6 @@ from dynamo.profiler.utils.dgdr_v1beta1_types import ( # noqa: E402 ...@@ -103,7 +103,6 @@ from dynamo.profiler.utils.dgdr_v1beta1_types import ( # noqa: E402
FeaturesSpec, FeaturesSpec,
MockerSpec, MockerSpec,
ModelCacheSpec, ModelCacheSpec,
OptimizationType,
PlannerConfig, PlannerConfig,
PlannerPreDeploymentSweepMode, PlannerPreDeploymentSweepMode,
ProfilingPhase, ProfilingPhase,
...@@ -175,7 +174,6 @@ def test_sla_defaults_and_validation(): ...@@ -175,7 +174,6 @@ def test_sla_defaults_and_validation():
assert sla.ttft == 2000.0 assert sla.ttft == 2000.0
assert sla.itl == 30.0 assert sla.itl == 30.0
assert sla.e2eLatency is None assert sla.e2eLatency is None
assert sla.optimizationType is None
print("✓ SLASpec defaults correct") print("✓ SLASpec defaults correct")
# explicit ttft+itl mode: OK # explicit ttft+itl mode: OK
...@@ -184,13 +182,6 @@ def test_sla_defaults_and_validation(): ...@@ -184,13 +182,6 @@ def test_sla_defaults_and_validation():
# e2eLatency mode: OK (null out ttft/itl) # e2eLatency mode: OK (null out ttft/itl)
SLASpec(ttft=None, itl=None, e2eLatency=500.0) SLASpec(ttft=None, itl=None, e2eLatency=500.0)
# optimizationType mode: OK (null out ttft/itl)
SLASpec(ttft=None, itl=None, optimizationType=OptimizationType.Throughput)
# optimizationType mode: OK without explicitly nulling defaults (TC-2.5 / TC-2.6)
SLASpec(optimizationType=OptimizationType.Throughput)
SLASpec(optimizationType=OptimizationType.Latency)
# e2eLatency mode: OK without explicitly nulling defaults # e2eLatency mode: OK without explicitly nulling defaults
SLASpec(e2eLatency=500.0) SLASpec(e2eLatency=500.0)
...@@ -230,10 +221,6 @@ def test_enums(): ...@@ -230,10 +221,6 @@ def test_enums():
assert ProfilingPhase.Initializing == "Initializing" assert ProfilingPhase.Initializing == "Initializing"
assert ProfilingPhase.SweepingPrefill == "SweepingPrefill" assert ProfilingPhase.SweepingPrefill == "SweepingPrefill"
# OptimizationType — TitleCase from Go const names
assert OptimizationType.Latency == "latency"
assert OptimizationType.Throughput == "throughput"
# SearchStrategy — TitleCase from Go const names # SearchStrategy — TitleCase from Go const names
assert SearchStrategy.Rapid == "rapid" assert SearchStrategy.Rapid == "rapid"
assert SearchStrategy.Thorough == "thorough" assert SearchStrategy.Thorough == "thorough"
......
...@@ -80,7 +80,7 @@ ...@@ -80,7 +80,7 @@
// //
// v1beta1-only fields with no v1alpha1 equivalent (omitted / TODO): // v1beta1-only fields with no v1alpha1 equivalent (omitted / TODO):
// //
// Hardware.*, Workload.{Concurrency,RequestRate}, SLA.{E2ELatency,OptimizationType}, // Hardware.*, Workload.{Concurrency,RequestRate}, SLA.{E2ELatency},
// Features.{KVRouter}, SearchStrategy // Features.{KVRouter}, SearchStrategy
// //
// # Status field mapping // # Status field mapping
......
...@@ -165,15 +165,6 @@ const ( ...@@ -165,15 +165,6 @@ const (
ProfilingReasonJobCreationFailed = "JobCreationFailed" ProfilingReasonJobCreationFailed = "JobCreationFailed"
) )
// OptimizationType specifies the profiling optimization strategy.
// +kubebuilder:validation:Enum=latency;throughput
type OptimizationType string
const (
OptimizationTypeLatency OptimizationType = "latency"
OptimizationTypeThroughput OptimizationType = "throughput"
)
// SearchStrategy controls the profiling search depth. // SearchStrategy controls the profiling search depth.
// +kubebuilder:validation:Enum=rapid;thorough // +kubebuilder:validation:Enum=rapid;thorough
type SearchStrategy string type SearchStrategy string
...@@ -231,14 +222,7 @@ type WorkloadSpec struct { ...@@ -231,14 +222,7 @@ type WorkloadSpec struct {
} }
// SLASpec defines the service-level agreement targets for profiling optimization. // SLASpec defines the service-level agreement targets for profiling optimization.
// Exactly one mode should be active: ttft+itl (default), e2eLatency, or optimizationType.
type SLASpec struct { type SLASpec struct {
// OptimizationType controls the profiling optimization strategy.
// Use when explicit SLA targets (ttft+itl or e2eLatency) are not known.
// +optional
// +kubebuilder:validation:Enum=latency;throughput
OptimizationType OptimizationType `json:"optimizationType,omitempty"`
// TTFT is the Time To First Token target in milliseconds. // TTFT is the Time To First Token target in milliseconds.
// +optional // +optional
// +python-default=2000 // +python-default=2000
......
...@@ -9078,18 +9078,6 @@ spec: ...@@ -9078,18 +9078,6 @@ spec:
itl: itl:
description: ITL is the Inter-Token Latency target in milliseconds. description: ITL is the Inter-Token Latency target in milliseconds.
type: number type: number
optimizationType:
allOf:
- enum:
- latency
- throughput
- enum:
- latency
- throughput
description: |-
OptimizationType controls the profiling optimization strategy.
Use when explicit SLA targets (ttft+itl or e2eLatency) are not known.
type: string
ttft: ttft:
description: TTFT is the Time To First Token target in milliseconds. description: TTFT is the Time To First Token target in milliseconds.
type: number type: number
......
...@@ -1480,24 +1480,6 @@ _Appears in:_ ...@@ -1480,24 +1480,6 @@ _Appears in:_
| `pvcMountPath` _string_ | PVCMountPath is the mount path for the PVC inside the container. | /opt/model-cache | Optional: \{\} <br /> | | `pvcMountPath` _string_ | PVCMountPath is the mount path for the PVC inside the container. | /opt/model-cache | Optional: \{\} <br /> |
#### OptimizationType
_Underlying type:_ _string_
OptimizationType specifies the profiling optimization strategy.
_Validation:_
- Enum: [latency throughput]
_Appears in:_
- [SLASpec](#slaspec)
| Field | Description |
| --- | --- |
| `latency` | |
| `throughput` | |
#### OverridesSpec #### OverridesSpec
...@@ -1579,7 +1561,6 @@ _Appears in:_ ...@@ -1579,7 +1561,6 @@ _Appears in:_
SLASpec defines the service-level agreement targets for profiling optimization. SLASpec defines the service-level agreement targets for profiling optimization.
Exactly one mode should be active: ttft+itl (default), e2eLatency, or optimizationType.
...@@ -1588,7 +1569,6 @@ _Appears in:_ ...@@ -1588,7 +1569,6 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `optimizationType` _[OptimizationType](#optimizationtype)_ | OptimizationType controls the profiling optimization strategy.<br />Use when explicit SLA targets (ttft+itl or e2eLatency) are not known. | | Enum: [latency throughput] <br />Optional: \{\} <br /> |
| `ttft` _float_ | TTFT is the Time To First Token target in milliseconds. | | Optional: \{\} <br /> | | `ttft` _float_ | TTFT is the Time To First Token target in milliseconds. | | Optional: \{\} <br /> |
| `itl` _float_ | ITL is the Inter-Token Latency target in milliseconds. | | Optional: \{\} <br /> | | `itl` _float_ | ITL is the Inter-Token Latency target in milliseconds. | | Optional: \{\} <br /> |
| `e2eLatency` _float_ | E2ELatency is the target end-to-end request latency in milliseconds.<br />Alternative to specifying TTFT + ITL. | | Optional: \{\} <br /> | | `e2eLatency` _float_ | E2ELatency is the target end-to-end request latency in milliseconds.<br />Alternative to specifying TTFT + ITL. | | Optional: \{\} <br /> |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment