Commit 0292feb5 authored by jon-chuang, committed by GitHub

feat: Extract Common Configs + Log Configs on Init + Add `test_` to `sdk/tests` filenames required for pytest (#434)
Co-authored-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com>
parent 0186aa7b
......@@ -217,17 +217,17 @@ def build_serve_command() -> click.Group:
# Initialize service_configs as empty dict if it's None
# Convert nested YAML structure to flat dict with dot notation
for service, configs in yaml_configs.items():
for key, value in configs.items():
if service not in service_configs:
service_configs[service] = {}
for key, value in configs.items():
service_configs[service][key] = value
# Process service-specific options
cmdline_overrides: t.Dict[str, t.Any] = _parse_service_args(ctx.args)
for service, configs in cmdline_overrides.items():
for key, value in configs.items():
if service not in service_configs:
service_configs[service] = {}
for key, value in configs.items():
service_configs[service][key] = value
# Process depends
......@@ -243,11 +243,12 @@ def build_serve_command() -> click.Group:
rich.print(f"DYNAMO_SERVICE_CONFIG={json.dumps(service_configs)}")
sys.exit(0)
configure_server_logging()
# Set environment variable with service configuration
if service_configs:
logger.info(f"Running dynamo serve with service configs {service_configs}")
os.environ["DYNAMO_SERVICE_CONFIG"] = json.dumps(service_configs)
configure_server_logging()
if working_dir is None:
if os.path.isdir(os.path.expanduser(bento)):
working_dir = os.path.expanduser(bento)
......
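The hunk above flattens the nested YAML structure into per-service dicts and then layers command-line overrides on top, so CLI flags win per key. A minimal sketch of that merge order, using hypothetical service and key names:

```python
# Sketch of the merge performed in build_serve_command (hypothetical data):
# YAML configs are applied first, then CLI overrides replace matching keys.
yaml_configs = {"Worker": {"model": "my-model", "router": "round-robin"}}
cmdline_overrides = {"Worker": {"router": "random"}}

service_configs: dict = {}
for source in (yaml_configs, cmdline_overrides):
    for service, configs in source.items():
        service_configs.setdefault(service, {}).update(configs)

assert service_configs == {"Worker": {"model": "my-model", "router": "random"}}
```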
......@@ -14,8 +14,11 @@
# limitations under the License.
import json
import logging
import os
logger = logging.getLogger(__name__)
class ServiceConfig(dict):
"""Configuration store that inherits from dict for simpler access patterns"""
......@@ -47,14 +50,27 @@ class ServiceConfig(dict):
return self[service_name][key]
def as_args(self, service_name, prefix=""):
"""Extract configs as CLI args for a service, with optional prefix filtering"""
"""Extract configs as CLI args for a service, with optional prefix filtering.
Every component additionally receives the args from the `Common` config
for each key it has subscribed to, i.e. each key listed in the component's
`common-configs` setting that is not overridden by the component's own
config.
"""
COMMON_CONFIG_SERVICE = "Common"
COMMON_CONFIG_KEY = "common-configs"
if service_name not in self:
return []
args = []
for key, value in self[service_name].items():
args: list[str] = []
def add_to_args(args: list[str], key: str, value):
if prefix and not key.startswith(prefix):
continue
return
if key.endswith(COMMON_CONFIG_KEY):
return
# Strip prefix if needed
arg_key = key[len(prefix) :] if prefix and key.startswith(prefix) else key
......@@ -68,4 +84,16 @@ class ServiceConfig(dict):
else:
args.extend([f"--{arg_key}", str(value)])
if (common := self.get(COMMON_CONFIG_SERVICE)) is not None and (
common_config_keys := self[service_name].get(COMMON_CONFIG_KEY)
) is not None:
for key in common_config_keys:
if key in common and key not in self[service_name]:
add_to_args(args, key, common[key])
for key, value in self[service_name].items():
add_to_args(args, key, value)
logger.info(f"Running {service_name} with {args=}")
return args
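The subscription logic above means a component inherits only the `Common` keys it lists under `common-configs`, its own values win on conflict, and the `common-configs` entry itself is never emitted as a flag. A minimal sketch with a hypothetical `Worker` component, mirroring the tests added below:

```python
# Hypothetical component and values; the singleton reset follows the same
# pattern as the tests in this commit.
import json
import os

from dynamo.sdk.lib.config import ServiceConfig

os.environ["DYNAMO_SERVICE_CONFIG"] = json.dumps(
    {
        "Common": {"model": "my-model", "block-size": 64},
        "Worker": {"block-size": 128, "common-configs": ["model", "block-size"]},
    }
)
ServiceConfig._instance = None
args = ServiceConfig.get_instance().as_args("Worker")
# "model" is inherited from Common; the component's own "block-size"
# overrides the common value of 64; "common-configs" is never emitted.
assert args == ["--model", "my-model", "--block-size", "128"]
```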
......@@ -86,15 +86,15 @@ class Backend2:
backend = depends(Backend)
def __init__(self) -> None:
print("Starting middle2")
print("Starting backend2")
@dynamo_endpoint()
async def generate(self, req: RequestType):
"""Forward requests to backend."""
req_text = req.text
print(f"Middle2 received: {req_text}")
text = f"{req_text}-mid2"
print(f"Backend2 received: {req_text}")
text = f"{req_text}-back2"
next_request = RequestType(text=text).model_dump_json()
print(next_request)
......@@ -117,8 +117,17 @@ class Middle:
req_text = req.text
print(f"Middle received: {req_text}")
text = f"{req_text}-mid"
for token in text.split():
yield f"Mid: {token}"
txt = RequestType(text=text)
if self.backend:
async for back_resp in self.backend.generate(txt.model_dump_json()):
print(f"Frontend received back_resp: {back_resp}")
yield f"Frontend: {back_resp}"
else:
async for back_resp in self.backend2.generate(txt.model_dump_json()):
print(f"Frontend received back_resp: {back_resp}")
yield f"Frontend: {back_resp}"
@service(resources={"cpu": "1"}, traffic={"timeout": 60})
......@@ -136,11 +145,6 @@ class Frontend:
print(f"Frontend received type: {type(text)}")
txt = RequestType(text=text)
print(f"Frontend sending: {type(txt)}")
if self.backend:
async for back_resp in self.backend.generate(txt.model_dump_json()):
print(f"Frontend received back_resp: {back_resp}")
yield f"Frontend: {back_resp}"
else:
async for mid_resp in self.middle.generate(txt.model_dump_json()):
print(f"Frontend received mid_resp: {mid_resp}")
yield f"Frontend: {mid_resp}"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pytest
from dynamo.sdk.lib.config import ServiceConfig
pytestmark = pytest.mark.pre_merge
def test_service_config_with_common_configs():
# Reset singleton instance
ServiceConfig._instance = None
# Set environment variable with config that includes common-configs
os.environ[
"DYNAMO_SERVICE_CONFIG"
] = """
{
"Common": {
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"block-size": 64,
"max-model-len": 16384
},
"VllmWorker": {
"enforce-eager": true,
"common-configs": ["model", "block-size", "max-model-len"]
}
}
"""
# Get arguments and verify common configs are included
service_config = ServiceConfig.get_instance()
vllm_worker_args = service_config.as_args("VllmWorker")
# Check that each common config appears in the arguments
for key in ["model", "block-size", "max-model-len"]:
assert f"--{key}" in vllm_worker_args
def test_service_config_without_common_configs():
# Reset singleton instance
ServiceConfig._instance = None
# Set environment variable with config that DOESN'T include common-configs
os.environ[
"DYNAMO_SERVICE_CONFIG"
] = """
{
"Common": {
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"block-size": 64,
"max-model-len": 16384
},
"VllmWorker": {
"enforce-eager": true
}
}
"""
# Get arguments and verify common configs are NOT included
service_config = ServiceConfig.get_instance()
vllm_worker_args = service_config.as_args("VllmWorker")
# Check that none of the common configs appear in arguments
for key in ["model", "block-size", "max-model-len"]:
assert f"--{key}" not in vllm_worker_args
def test_service_config_with_direct_configs():
# Reset singleton instance
ServiceConfig._instance = None
# Set environment variable with direct configs (no Common section reference)
os.environ[
"DYNAMO_SERVICE_CONFIG"
] = """
{
"VllmWorker": {
"enforce-eager": true,
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"block-size": 64,
"max-model-len": 16384
}
}
"""
# Get arguments and verify direct configs are included
service_config = ServiceConfig.get_instance()
vllm_worker_args = service_config.as_args("VllmWorker")
# Check that each config appears in the arguments
for key in ["model", "block-size", "max-model-len"]:
assert f"--{key}" in vllm_worker_args
def test_service_config_override_common_configs():
# Reset singleton instance
ServiceConfig._instance = None
# Set environment variable with config that includes common-configs
# overridden by the subscribing config
os.environ[
"DYNAMO_SERVICE_CONFIG"
] = """
{
"Common": {
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"block-size": 64,
"max-model-len": 16384
},
"VllmWorker": {
"enforce-eager": true,
"block-size": 128,
"common-configs": ["model", "block-size", "max-model-len"]
}
}
"""
# Get arguments and verify common configs are included
service_config = ServiceConfig.get_instance()
vllm_worker_args = service_config.as_args("VllmWorker")
# Check that each common config appears in the arguments
for key in ["model", "block-size", "max-model-len"]:
assert f"--{key}" in vllm_worker_args
assert vllm_worker_args[vllm_worker_args.index("--block-size") + 1] == "128"
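The commit title also notes that files under `sdk/tests` need a `test_` prefix for pytest to collect them. Assuming default discovery settings, a plausible way to run the suite above, selecting the `pre_merge` marker set via `pytestmark`, is:

```bash
# pytest only collects test_*.py files by default, hence the renames;
# -m filters on the pre_merge marker applied at module level above
pytest sdk/tests -m pre_merge
```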
......@@ -79,4 +79,4 @@ async def test_pipeline():
if attempt == max_retries - 1:
raise
print(f"Attempt {attempt + 1} failed, retrying...")
await asyncio.sleep(1)
await asyncio.sleep(3)
......@@ -162,36 +162,36 @@ This will print out something like
```bash
Service Configuration:
{
"Common": {
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"block-size": 64,
"max-model-len": 16384,
},
"Frontend": {
"served_model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"endpoint": "dynamo.Processor.chat/completions",
"port": 8000
},
"Processor": {
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"block-size": 64,
"max-model-len": 16384,
"router": "round-robin"
"router": "round-robin",
"common-configs": [model, block-size, max-model-len]
},
"VllmWorker": {
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"enforce-eager": true,
"block-size": 64,
"max-model-len": 16384,
"max-num-batched-tokens": 16384,
"enable-prefix-caching": true,
"router": "random",
"tensor-parallel-size": 1,
"ServiceArgs": {
"workers": 1
}
},
"common-configs": [model, block-size, max-model-len]
}
}
Environment Variable that would be set:
DYNAMO_SERVICE_CONFIG={"Frontend": {"served_model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "endpoint": "dynamo.Processor.chat/completions", "port": 8000}, "Processor": {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "block-size": 64,
"max-model-len": 16384, "router": "round-robin"}, "VllmWorker": {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "enforce-eager": true, "block-size": 64, "max-model-len": 16384, "max-num-batched-tokens": 16384, "enable-prefix-caching":
true, "router": "random", "tensor-parallel-size": 1, "ServiceArgs": {"workers": 1}}}
DYNAMO_SERVICE_CONFIG={"Common": {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "block-size": 64, "max-model-len": 16384}, "Frontend": {"served_model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "endpoint": "dynamo.Processor.chat/completions", "port": 8000}, "Processor": {"router": "round-robin", "common-configs": ["model", "block-size", "max-model-len"]}, "VllmWorker": {"enforce-eager": true, "max-num-batched-tokens": 16384, "enable-prefix-caching":
true, "router": "random", "tensor-parallel-size": 1, "ServiceArgs": {"workers": 1}, "common-configs": ["model", "block-size", "max-model-len"]}}
```
You can override any of these configuration options by passing in CLI flags to serve. For example, to change the routing strategy, you can run
......
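The concrete command is collapsed in this diff. Assuming the `--<Service>.<key>=<value>` override syntax handled by `_parse_service_args`, a hypothetical invocation might look like:

```bash
# Hypothetical example: override the Processor's routing strategy for this run
dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml --Processor.router=random
```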
......@@ -43,7 +43,7 @@ def parse_args(service_name, prefix) -> Namespace:
help="Minimum number of workers required before proceeding",
)
parser.add_argument(
"--model-name",
"--model",
type=str,
default="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
help="Model that is being served",
......
......@@ -12,6 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
......@@ -19,16 +23,11 @@ Frontend:
port: 8000
Processor:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
router: round-robin
common-configs: [model, block-size, max-model-len]
VllmWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
block-size: 64
max-model-len: 16384
max-num-batched-tokens: 16384
enable-prefix-caching: true
router: random
......@@ -37,3 +36,4 @@ VllmWorker:
workers: 1
resources:
gpu: 1
common-configs: [model, block-size, max-model-len]
......@@ -12,6 +12,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
router: kv
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
......@@ -19,26 +25,19 @@ Frontend:
port: 8000
Processor:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
router: kv
common-configs: [model, block-size, max-model-len, router]
Router:
model-name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
min-workers: 1
common-configs: [model]
VllmWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384
max-num-batched-tokens: 16384
enable-prefix-caching: true
router: kv
tensor-parallel-size: 1
ServiceArgs:
workers: 1
resources:
gpu: 1
common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
......@@ -12,7 +12,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
......@@ -20,13 +24,10 @@ Frontend:
port: 8000
Processor:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
router: round-robin
common-configs: [model, block-size]
VllmWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
max-model-len: 16384
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
......@@ -35,13 +36,12 @@ VllmWorker:
workers: 1
resources:
gpu: 1
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
max-model-len: 16384
max-num-batched-tokens: 16384
ServiceArgs:
workers: 1
resources:
gpu: 1
common-configs: [model, block-size, max-model-len, kv-transfer-config]
......@@ -12,6 +12,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
router: kv
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
......@@ -19,41 +25,30 @@ Frontend:
port: 8000
Processor:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
router: kv
common-configs: [model, block-size, max-model-len, router]
Router:
model-name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
min-workers: 1
common-configs: [model]
VllmWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384
max-num-batched-tokens: 16384
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 1
router: kv
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: 1
common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
# TODO - set all of these but model as default
PrefillWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384
max-num-batched-tokens: 16384
ServiceArgs:
workers: 1
resources:
gpu: 1
common-configs: [model, block-size, max-model-len, kv-transfer-config]
......@@ -89,7 +89,7 @@ pub enum PromptContextMixin {
/// Support OAI Chat Messages and Tools
OaiChat,
/// Enables templates with `{{datatime}}` to be rendered with the current date and time.
/// Enables templates with `{{datetime}}` to be rendered with the current date and time.
Llama3DateTime,
}
......