"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "3a5fe17db902203e2d4f1eb13673219dcd9b88f4"
Unverified Commit 7f9d92ed authored by mohammedabdulwahhab's avatar mohammedabdulwahhab Committed by GitHub
Browse files

fix: planner fixes DEP-78 DEP-94 (#1082)

parent 995cf55c
......@@ -30,7 +30,7 @@ PYTHONPATH=/workspace/examples/llm python components/planner.py --namespace <nam
## Backends
1. `local` - uses circus to start/stop worker subprocesses
2. `kubernetes` - uses the kubernetes API to adjust replicas of each component's resource definition. This is a work in progress and not currently available
2. `kubernetes` - uses the kubernetes API to adjust replicas of the DynamoGraphDeployment resource, which automatically scales the corresponding worker pods up or down
## Local Backend (LocalPlanner)
......@@ -118,10 +118,12 @@ If scaled to zero, the initial entry is kept without resources to maintain confi
### Testing
For manual testing, you can use the controller_test.py file to add/remove components after you've run a serve command with `--enable-local-planner`.
For manual testing, you can use the controller_test.py file to add/remove components after you've run a serve command on a Dynamo pipeline where the planner is linked.
## Kubernetes Backend
[Coming soon]
The Kubernetes backend works by updating the replicas count of the DynamoGraphDeployment custom resource. When the planner determines that workers need to be scaled up or down based on workload metrics, it uses the Kubernetes API to patch the DynamoGraphDeployment resource specification, changing the replicas count for the appropriate worker component. The Kubernetes operator then reconciles this change by creating or terminating the necessary pods. This provides a seamless autoscaling experience in Kubernetes environments without requiring manual intervention.
The Kubernetes backend will automatically be used by Planner when your pipeline is deployed with `dynamo deployment create`. By default, the planner will run in no-op mode, which means it will monitor metrics but not take scaling actions. To enable actual scaling, you should also specify `--Planner.no-operation=false`.
......@@ -18,10 +18,12 @@ __all__ = [
"LocalConnector",
"PlannerConnector",
"KubernetesConnector",
"PlannerDefaults",
]
# Import the classes
from dynamo.planner.circusd import CircusController
from dynamo.planner.defaults import PlannerDefaults
from dynamo.planner.kubernetes_connector import KubernetesConnector
from dynamo.planner.local_connector import LocalConnector
from dynamo.planner.planner_connector import PlannerConnector
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Source of truth for planner defaults
class PlannerDefaults:
    """Source of truth for planner default configuration.

    These class attributes are read both by the standalone planner CLI
    (as argparse defaults) and by the Planner service component (as
    fallbacks for values missing from the service config).
    """

    # Dynamo namespace the planner will look at.
    namespace: str = "dynamo"
    # Model name that is being served (used e.g. for the prefill queue name).
    served_model_name: str = "vllm"
    # Backend environment: "local" (circus subprocesses) or "kubernetes".
    environment: str = "local"
    # When True, only observe/log metrics; take no scaling actions.
    no_operation: bool = False
    # Tensorboard logging directory; None disables persistent log output.
    log_dir = None
    # Seconds between scaling adjustments.
    adjustment_interval: int = 10
    # Seconds between metric pulls.
    metric_pulling_interval: int = 1
    # Maximum total GPUs across prefill + decode workers.
    max_gpu_budget: int = 8
    # Minimum number of endpoints to keep for prefill/decode workers.
    min_endpoint: int = 1
    # KV cache utilization threshold to scale decode workers up.
    decode_kv_scale_up_threshold: float = 0.9
    # KV cache utilization threshold to scale decode workers down.
    decode_kv_scale_down_threshold: float = 0.5
    # Prefill queue size threshold to scale prefill workers up.
    prefill_queue_scale_up_threshold: float = 5.0
    # Prefill queue size threshold to scale prefill workers down.
    prefill_queue_scale_down_threshold: float = 0.2
    # GPUs per decode engine.
    decode_engine_num_gpu: int = 1
    # GPUs per prefill engine.
    prefill_engine_num_gpu: int = 1
......@@ -29,7 +29,11 @@ import typer
from rich.console import Console
from rich.panel import Panel
from dynamo.sdk.cli.utils import resolve_service_config
from dynamo.sdk.cli.utils import (
is_local_planner_enabled,
raise_local_planner_warning,
resolve_service_config,
)
from dynamo.sdk.core.runner import TargetEnum
if t.TYPE_CHECKING:
......@@ -82,10 +86,6 @@ def serve(
False,
help="Print the final service configuration and exit without starting the server",
),
enable_local_planner: bool = typer.Option(
False,
help="Save a snapshot of your service state to a file that allows planner to edit your deployment configuration",
),
target: TargetEnum = typer.Option(
TargetEnum.DYNAMO,
"--target",
......@@ -149,6 +149,12 @@ def serve(
logger.debug("Dependencies: %s", [dep.on.name for dep in svc.dependencies.values()])
LinkedServices.remove_unused_edges()
# Check if local planner is enabled
enable_local_planner = is_local_planner_enabled(svc, service_configs)
if enable_local_planner:
# Raise warning if local planner is enabled, but workers for prefill or decode is > 1. Not supported.
raise_local_planner_warning(svc, service_configs)
from dynamo.sdk.cli.serving import serve_dynamo_graph # type: ignore
svc.inject_config()
......
......@@ -28,17 +28,23 @@ import socket
from typing import Any, DefaultDict, Dict, Iterator, Optional, Protocol, TextIO, Union
import click
import typer
import yaml
from click import Command, Context
from rich.console import Console
from dynamo.planner.defaults import PlannerDefaults # type: ignore[attr-defined]
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sdk.core.protocol.interface import ComponentType
from dynamo.sdk.core.runner import TargetEnum
configure_dynamo_logging()
logger = logging.getLogger(__name__)
console = Console()
DYN_LOCAL_STATE_DIR = "DYN_LOCAL_STATE_DIR"
PLANNER_SERVICE_NAME = "Planner"
# Define a Protocol for services to ensure type safety
......@@ -369,3 +375,65 @@ def configure_target_environment(target: TargetEnum):
raise ValueError(f"Invalid target: {target}")
logger.debug(f"Setting deployment target to {target}")
set_target(target)
def is_local_planner_enabled(svc: Any, service_configs: dict) -> bool:
    """Check if local planner is enabled.

    A pipeline may contain at most one planner component. The planner is
    considered "local" when its resolved ``environment`` setting is "local".

    Args:
        svc: The entrypoint service instance
        service_configs: Dictionary of service configurations

    Returns:
        bool: True if exactly one planner is present and configured for the
        "local" environment, False otherwise (including when no planner is
        linked into the pipeline).

    Raises:
        typer.Exit: If more than one planner is found in the pipeline.
    """
    # Check all nodes (dependencies plus the entrypoint itself) to find planners
    nodes = list(svc.all_services().values())
    nodes.append(svc)
    planners = [
        node
        for node in nodes
        if node.config.get("dynamo", {}).get("component_type") == ComponentType.PLANNER
    ]
    if len(planners) > 1:
        console.print(
            "[bold red]Error:[/bold red] More than one planner found in the pipeline"
        )
        raise typer.Exit(code=1)
    if not planners:
        return False
    # Exactly one planner: resolve its environment, falling back to the default
    planner_config = service_configs.get(PLANNER_SERVICE_NAME, {})
    environment = planner_config.get("environment", PlannerDefaults.environment)
    return environment == "local"
def raise_local_planner_warning(svc: Any, service_configs: dict) -> None:
    """Abort if an active local planner is combined with multi-worker components.

    The local planner only supports pipelines where prefill and decode
    components each run a single worker. If the planner is active (not in
    no-op mode) and any such component is configured with more than one
    worker, log an error and exit.

    Args:
        svc: The service instance
        service_configs: Dictionary of service configurations

    Raises:
        typer.Exit: If the planner is active and a prefill/decode worker
            count exceeds 1.
    """
    planner_config = service_configs.get(PLANNER_SERVICE_NAME, {})
    # Resolve no-op setting; a no-op planner takes no scaling actions, so the
    # worker-count restriction does not apply and we can return early.
    no_op = planner_config.get("no-operation", PlannerDefaults.no_operation)
    if no_op:
        return
    # Check worker counts across all nodes (dependencies plus the entrypoint)
    nodes = list(svc.all_services().values())
    nodes.append(svc)
    worker_names = ("PrefillWorker", "VllmWorker")
    if any(
        node.config.get("workers", 1) > 1 for node in nodes if node.name in worker_names
    ):
        logger.error(
            "Local planner is enabled, but workers for prefill or decode is > 1. Local planner must be started with prefill and decode workers set to 1."
        )
        raise typer.Exit(code=1)
......@@ -19,6 +19,7 @@ import logging
import os
import shlex
import sys
from dataclasses import asdict
from typing import Any, Dict, List, Optional, Set, Type, TypeVar
import psutil
......@@ -80,6 +81,8 @@ class LocalService(ServiceMixin, ServiceInterface[T]):
self._dynamo_config = dynamo_config or DynamoConfig(
name=name, namespace="default"
)
# Add the dynamo config to the service config
self._config["dynamo"] = asdict(self._dynamo_config)
self._watcher = watcher
self._socket = socket
self.app = app or FastAPI(title=name)
......
......@@ -48,19 +48,9 @@ To measure the performance of dynamo with planner, we start from a 1p1d deployme
```bash
cd examples/llm
dynamo serve graphs.disagg:Frontend -f <path to disagg_1p1d.yml in this folder> --enable-local-planner
dynamo serve graphs.disagg:Frontend -f disagg_1p1d.yml
# in terminal 2
PYTHONPATH=/workspace/examples/llm python components/planner.py \
--metric-pulling-interval 1 \
--adjustment-interval 10 \
--prefill-queue-scale-down-threshold 0.2 \
--prefill-queue-scale-up-threshold 10 \
--decode-kv-scale-down-threshold 0.3 \
--decode-kv-scale-up-threshold 0.6 \
--log-dir log/planner
# in terminal 3
genai-perf profile \
--tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-m deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
......@@ -92,12 +82,9 @@ In this example, we use a fixed 2p2d engine as baseline. Planner provides a `--n
```bash
# in terminal 1
dynamo serve --enable-local-planner graphs.disagg:Frontend -f disagg_2p2d.yml
dynamo serve graphs.disagg:Frontend -f disagg_2p2d.yml
# in terminal 2 (optional)
PYTHONPATH=/workspace/examples/llm python components/planner.py --no-operation --log-dir log/2p2d
# in terminal 3
# in terminal 2
genai-perf profile --tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B -m deepseek-ai/DeepSeek-R1-Distill-Llama-8B --service-kind openai --endpoint-type chat --url http://localhost:8000 --streaming --input-file payload:sin_b512_t600_rr5.0-20.0-150.0_io3000150-3000150-0.2-0.8-10.jsonl
```
......
......@@ -43,3 +43,14 @@ PrefillWorker:
resources:
gpu: 1
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: false
metric-pulling-interval: 1
adjustment-interval: 10
prefill-queue-scale-down-threshold: 0.2
prefill-queue-scale-up-threshold: 10
decode-kv-scale-down-threshold: 0.3
decode-kv-scale-up-threshold: 0.6
log-dir: log/planner
......@@ -43,3 +43,8 @@ PrefillWorker:
resources:
gpu: 1
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
log-dir: log/2p2d
......@@ -49,26 +49,48 @@ There are two additional rules set by planner to prevent over-compensation:
1. We do not scale up prefill worker if the prefill queue size is estimated to reduce below the `--prefill-queue-scale-up-threshold` within the next `NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD=3` adjustment intervals following the trend observed in the current adjustment interval.
## Usage
After you've deployed a dynamo graph - you can start the planner with the following command:
The planner is started automatically as part of Dynamo pipelines when running `dynamo serve`. You can configure the planner just as you would any other component in your pipeline, either via YAML configuration or through CLI arguments.
Usage:
```bash
# Configure the planner through YAML configuration
dynamo serve graphs.disagg:Frontend -f disagg.yaml
# disagg.yaml
# ...
# Planner:
# environment: local
# no-operation: false
# log-dir: log/planner
# Configure the planner through CLI arguments
dynamo serve graphs.disagg:Frontend -f disagg.yaml --Planner.environment=local --Planner.no-operation=false --Planner.log-dir=log/planner
```
The planner accepts the following configuration options:
* `namespace` (str, default: "dynamo"): Namespace planner will look at
* `served-model-name` (str, default: "vllm"): Model name that is being served
* `no-operation` (bool, default: false): Do not make any adjustments, just observe the metrics and log to tensorboard.
* `log-dir` (str, default: None): Tensorboard logging directory
* `adjustment-interval` (int, default: 10): Interval in seconds between scaling adjustments
* `metric-pulling-interval` (int, default: 1): Interval in seconds between metric pulls
* `max-gpu-budget` (int, default: 8): Maximum number of GPUs to use, planner will not scale up more than this number of GPUs for prefill plus decode workers
* `min-endpoint` (int, default: 1): Minimum number of endpoints to keep for prefill/decode workers, planner will not scale down below this number
* `decode-kv-scale-up-threshold` (float, default: 0.9): KV cache utilization threshold to scale up decode workers
* `decode-kv-scale-down-threshold` (float, default: 0.5): KV cache utilization threshold to scale down decode workers
* `prefill-queue-scale-up-threshold` (float, default: 5.0): Queue utilization threshold to scale up prefill workers
* `prefill-queue-scale-down-threshold` (float, default: 0.2): Queue utilization threshold to scale down prefill workers
* `decode-engine-num-gpu` (int, default: 1): Number of GPUs per decode engine
* `prefill-engine-num-gpu` (int, default: 1): Number of GPUs per prefill engine
Alternatively, you can run the planner as a standalone python process. The configuration options above can be directly passed in as CLI arguments.
```bash
PYTHONPATH=/workspace/examples/llm python components/planner.py <arguments>
# Example
# PYTHONPATH=/workspace/examples/llm python components/planner.py --namespace=dynamo --served-model-name=vllm --no-operation --log-dir=log/planner
```
Planner takes the following arguments:
* `--namespace` (str, default: "dynamo"): Namespace planner will look at
* `--served-model-name` (str, default: "vllm"): Model name that is being served
* `--no-operation` (flag): Do not make any adjustments, just observe the metrics and log to tensorboard
* `--log-dir` (str, default: None): Tensorboard logging directory
* `--adjustment-interval` (int, default: 10): Interval in seconds between scaling adjustments
* `--metric-pulling-interval` (int, default: 1): Interval in seconds between metric pulls
* `--max-gpu-budget` (int, default: 8): Maximum number of GPUs to use, planner will not scale up more than this number of GPUs for prefill plus decode workers
* `--min-endpoint` (int, default: 1): Minimum number of endpoints to keep for prefill/decode workers, planner will not scale down below this number
* `--decode-kv-scale-up-threshold` (float, default: 0.9): KV cache utilization threshold to scale up decode workers
* `--decode-kv-scale-down-threshold` (float, default: 0.5): KV cache utilization threshold to scale down decode workers
* `--prefill-queue-scale-up-threshold` (float, default: 5.0): Queue utilization threshold to scale up prefill workers
* `--prefill-queue-scale-down-threshold` (float, default: 0.2): Queue utilization threshold to scale down prefill workers
* `--decode-engine-num-gpu` (int, default: 1): Number of GPUs per decode engine
* `--prefill-engine-num-gpu` (int, default: 1): Number of GPUs per prefill engine
### Tensorboard
......@@ -78,8 +100,9 @@ tensorboard --logdir=<path-to-tensorboard-log-dir>
```
## Backends
We currently only support one backend:
We currently support two backends:
1. `local` - uses circus to start/stop worker subprocesses
2. `kubernetes` - uses kubernetes to scale up/down the number of worker pods by updating the replicas count of the DynamoGraphDeployment resource
### Local Backend
......@@ -129,3 +152,7 @@ Note that we keep the initial non-suffix entry in order to know what cmd we will
> [!NOTE]
> At the moment - planner work best if your initial replicas per worker are 1. This is because if you specify replicas > 1 when you initially start `dynamo serve`, the current implementation in `serving.py` starts each process in the same watcher.
### Kubernetes Backend
The Kubernetes backend works by updating the replicas count of the DynamoGraphDeployment custom resource. When the planner detects the need to scale up or down a specific worker type, it uses the Kubernetes API to patch the DynamoGraphDeployment resource, modifying the replicas count for the appropriate component. The Kubernetes operator then reconciles this change by creating or removing the necessary pods. This provides a seamless scaling experience in Kubernetes environments without requiring manual intervention.
\ No newline at end of file
......@@ -125,6 +125,9 @@ This figure shows an overview of the major components to deploy:
```
> [!NOTE]
> The planner component is enabled by default for all deployment architectures but is set to no-op mode. This means the planner observes metrics but doesn't take scaling actions. To enable active scaling, you can add `--Planner.no-operation=false` to your `dynamo serve` command. For more details, see the [Planner documentation](../../components/planner/README.md).
### Example architectures
_Note_: For a non-dockerized deployment, first export `DYNAMO_HOME` to point to the dynamo repository root, e.g. `export DYNAMO_HOME=$(pwd)`
......@@ -216,6 +219,8 @@ export DEPLOYMENT_NAME=llm-agg
dynamo deployment create $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg.yaml
```
**Note**: Optionally add `--Planner.no-operation=false` at the end of the deployment command to enable the planner component to take scaling actions on your deployment.
### Testing the Deployment
Once the deployment is complete, you can test it using:
......
......@@ -31,6 +31,7 @@ from utils.prefill_queue import PrefillQueue
from dynamo.llm import KvMetricsAggregator
from dynamo.planner import KubernetesConnector, LocalConnector
from dynamo.planner.defaults import PlannerDefaults
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -78,6 +79,8 @@ class Planner:
logger.info(f"Components present in namespace: {args.namespace}")
self.init_time = time.time()
# Set the appropriate logger function for repeated metric logging
self._repeating_log_func = logger.debug if args.no_operation else logger.info
async def set_metric_aggregator(self):
# TODO: separate KV metrics and prefill metrics
......@@ -100,7 +103,9 @@ class Planner:
p_endpoints = self.prefill_client.endpoint_ids()
except Exception:
p_endpoints = []
logger.info("No prefill workers found, operating in aggregated mode")
self._repeating_log_func(
"No prefill workers found, operating in aggregated mode"
)
try:
if self.workers_client is None:
self.workers_client = (
......@@ -118,13 +123,13 @@ class Planner:
return p_endpoints, d_endpoints
async def reset_adjustment_interval(self):
logger.info(
self._repeating_log_func(
f"Reset metrics for new adjustment interval at t={time.time() - self.init_time:.1f}s"
)
self.p_endpoints, self.d_endpoints = await self.get_workers_info()
logger.info(
self._repeating_log_func(
f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
)
......@@ -135,7 +140,9 @@ class Planner:
self.last_adjustment_time = time.time()
async def collect_metrics(self):
logger.info(f"Collecting metrics at t={time.time() - self.init_time:.1f}s")
self._repeating_log_func(
f"Collecting metrics at t={time.time() - self.init_time:.1f}s"
)
# collect prefill queue load
try:
......@@ -146,14 +153,16 @@ class Planner:
prefill_queue_size = await prefill_queue.get_queue_size()
measure_time = time.time() - self.init_time
self.prefill_queue_load.append(prefill_queue_size)
logger.info(
self._repeating_log_func(
f"Collected prefill queue size at t={measure_time:.1f}s: {int(prefill_queue_size)}"
)
self.writer.add_scalar(
"prefill_queue_size", prefill_queue_size, measure_time
)
except Exception as e:
logger.info(f"Failed to collect prefill queue size metrics: {e}")
self._repeating_log_func(
f"Failed to collect prefill queue size metrics: {e}"
)
# collect kv load
total_active_requests: int = 0
......@@ -176,7 +185,7 @@ class Planner:
kv_load = kv_load + 0.02 * num_requests_waiting
self.kv_load.append(kv_load)
measure_time = time.time() - self.init_time
logger.info(
self._repeating_log_func(
f"Collected kv load at t={measure_time:.1f}s: {self.kv_load[prev_kv_load_len:]} (act/pnd req: {total_active_requests}/{total_queued_requests})"
)
average_kv_load = np.mean(self.kv_load[prev_kv_load_len:])
......@@ -185,7 +194,7 @@ class Planner:
"total_queued_requests", total_queued_requests, measure_time
)
except Exception as e:
logger.info(f"Failed to collect kv load metrics: {e}")
self._repeating_log_func(f"Failed to collect kv load metrics: {e}")
p_endpoints, d_endpoints = await self.get_workers_info()
self.writer.add_scalar(
......@@ -333,6 +342,12 @@ class Planner:
"""Main loop for the planner"""
await self.set_metric_aggregator()
if self._repeating_log_func == logger.debug:
logger.info(
"Running in no-operation mode - detailed metrics will be logged at DEBUG level"
)
await self.reset_adjustment_interval()
while True:
......@@ -391,90 +406,91 @@ if __name__ == "__main__":
parser.add_argument(
"--namespace",
type=str,
default="dynamo",
default=PlannerDefaults.namespace,
help="Namespace planner will look at",
)
parser.add_argument(
"--served-model-name",
type=str,
default="vllm",
default=PlannerDefaults.served_model_name,
help="Model name that is being served (used for prefill queue name)",
)
parser.add_argument(
"--no-operation",
action="store_true",
default=PlannerDefaults.no_operation,
help="Do not make any adjustments, just observe the metrics",
)
parser.add_argument(
"--log-dir",
type=str,
default=None,
default=PlannerDefaults.log_dir,
help="Tensorboard logging directory",
)
parser.add_argument(
"--adjustment-interval",
type=int,
default=10,
default=PlannerDefaults.adjustment_interval,
help="Interval in seconds between scaling adjustments",
)
parser.add_argument(
"--metric-pulling-interval",
type=int,
default=1,
default=PlannerDefaults.metric_pulling_interval,
help="Interval in seconds between metric pulls",
)
parser.add_argument(
"--max-gpu-budget",
type=int,
default=8,
default=PlannerDefaults.max_gpu_budget,
help="Maximum number of GPUs to use",
)
parser.add_argument(
"--min-endpoint",
type=int,
default=1,
default=PlannerDefaults.min_endpoint,
help="Minimum number of endpoints to keep for prefill/decode workers",
)
parser.add_argument(
"--decode-kv-scale-up-threshold",
type=float,
default=0.9,
default=PlannerDefaults.decode_kv_scale_up_threshold,
help="KV cache utilization threshold to scale up decode workers",
)
parser.add_argument(
"--decode-kv-scale-down-threshold",
type=float,
default=0.5,
default=PlannerDefaults.decode_kv_scale_down_threshold,
help="KV cache utilization threshold to scale down decode workers",
)
parser.add_argument(
"--prefill-queue-scale-up-threshold",
type=float,
default=5,
default=PlannerDefaults.prefill_queue_scale_up_threshold,
help="Queue utilization threshold to scale up prefill workers",
)
parser.add_argument(
"--prefill-queue-scale-down-threshold",
type=float,
default=0.2,
default=PlannerDefaults.prefill_queue_scale_down_threshold,
help="Queue utilization threshold to scale down prefill workers",
)
parser.add_argument(
"--decode-engine-num-gpu",
type=int,
default=1,
default=PlannerDefaults.decode_engine_num_gpu,
help="Number of GPUs per decode engine",
)
parser.add_argument(
"--prefill-engine-num-gpu",
type=int,
default=1,
default=PlannerDefaults.prefill_engine_num_gpu,
help="Number of GPUs per prefill engine",
)
parser.add_argument(
"--environment",
type=str,
default="local",
default=PlannerDefaults.environment,
help="Environment to run the planner in (local, kubernetes)",
)
args = parser.parse_args()
......
......@@ -19,8 +19,10 @@ import logging
from pydantic import BaseModel
from components.planner import start_planner # type: ignore[attr-defined]
from dynamo.planner.defaults import PlannerDefaults
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sdk import async_on_start, dynamo_context, dynamo_endpoint, service
from dynamo.sdk.core.protocol.interface import ComponentType
from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.image import DYNAMO_IMAGE
......@@ -34,7 +36,7 @@ class RequestType(BaseModel):
@service(
dynamo={
"namespace": "dynamo",
"component_type": "planner",
"component_type": ComponentType.PLANNER,
},
resources={"cpu": "10", "memory": "20Gi"},
workers=1,
......@@ -50,33 +52,59 @@ class Planner:
# Get namespace directly from dynamo_context as it contains the active namespace
self.namespace = dynamo_context["namespace"]
self.environment = config.get("Planner", {}).get("environment", "local")
self.no_operation = config.get("Planner", {}).get("no-operation", True)
config_instance = config.get("Planner", {})
# Create args with all parameters from planner.py, using defaults except for namespace and environment
self.args = argparse.Namespace(
namespace=self.namespace,
environment=self.environment,
served_model_name="vllm",
no_operation=self.no_operation,
log_dir=None,
adjustment_interval=10,
metric_pulling_interval=1,
max_gpu_budget=8,
min_endpoint=1,
decode_kv_scale_up_threshold=0.9,
decode_kv_scale_down_threshold=0.5,
prefill_queue_scale_up_threshold=5,
prefill_queue_scale_down_threshold=0.2,
decode_engine_num_gpu=1,
prefill_engine_num_gpu=1,
environment=config_instance.get("environment", PlannerDefaults.environment),
served_model_name=config_instance.get(
"served-model-name", PlannerDefaults.served_model_name
),
no_operation=config_instance.get(
"no-operation", PlannerDefaults.no_operation
),
log_dir=config_instance.get("log-dir", PlannerDefaults.log_dir),
adjustment_interval=config_instance.get(
"adjustment-interval", PlannerDefaults.adjustment_interval
),
metric_pulling_interval=config_instance.get(
"metric-pulling-interval", PlannerDefaults.metric_pulling_interval
),
max_gpu_budget=config_instance.get(
"max-gpu-budget", PlannerDefaults.max_gpu_budget
),
min_endpoint=config_instance.get(
"min-endpoint", PlannerDefaults.min_endpoint
),
decode_kv_scale_up_threshold=config_instance.get(
"decode-kv-scale-up-threshold",
PlannerDefaults.decode_kv_scale_up_threshold,
),
decode_kv_scale_down_threshold=config_instance.get(
"decode-kv-scale-down-threshold",
PlannerDefaults.decode_kv_scale_down_threshold,
),
prefill_queue_scale_up_threshold=config_instance.get(
"prefill-queue-scale-up-threshold",
PlannerDefaults.prefill_queue_scale_up_threshold,
),
prefill_queue_scale_down_threshold=config_instance.get(
"prefill-queue-scale-down-threshold",
PlannerDefaults.prefill_queue_scale_down_threshold,
),
decode_engine_num_gpu=config_instance.get(
"decode-engine-num-gpu", PlannerDefaults.decode_engine_num_gpu
),
prefill_engine_num_gpu=config_instance.get(
"prefill-engine-num-gpu", PlannerDefaults.prefill_engine_num_gpu
),
)
@async_on_start
async def async_init(self):
import asyncio
await asyncio.sleep(60)
await asyncio.sleep(30)
logger.info("Calling start_planner")
await start_planner(self.runtime, self.args)
logger.info("Planner started")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment