register.py 4.75 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from typing import Optional

import sglang as sgl
from sglang.srt.server_args import ServerArgs

from dynamo._core import Endpoint
11
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
12
from dynamo.sglang.args import DynamoArgs
13
14
15
16
17
18


async def register_llm_with_runtime_config(
    engine: sgl.Engine,
    endpoint: Endpoint,
    server_args: ServerArgs,
    dynamo_args: DynamoArgs,
) -> bool:
    """Register the model served by ``engine`` on ``endpoint`` with its runtime config.

    The runtime config is gathered via ``_get_runtime_config`` and passed to
    ``register_llm`` together with the model path, served model name, KV cache
    block size (``server_args.page_size``), migration limit and custom chat
    template path.

    When ``server_args.skip_tokenizer_init`` is set, the model is registered
    with token input and both Chat and Completions model types. Otherwise a
    warning is logged and the model is registered with text input and the
    Chat model type only.

    Args:
        engine: SGLang engine the runtime config is read from.
        endpoint: Endpoint the model is registered on.
        server_args: SGLang server arguments (model path, served model name,
            page size, ``skip_tokenizer_init``).
        dynamo_args: Dynamo arguments (migration limit, custom Jinja template).

    Returns:
        bool: True if registration succeeded, False if it failed. Registration
        errors are logged rather than propagated.
    """
    runtime_config = await _get_runtime_config(engine, server_args, dynamo_args)

    if server_args.skip_tokenizer_init:
        input_type = ModelInput.Tokens
        output_type = ModelType.Chat | ModelType.Completions
    else:
        logging.warning(
            "The skip-tokenizer-init flag was not set. Using the sglang tokenizer/detokenizer instead. The dynamo tokenizer/detokenizer will not be used and only v1/chat/completions will be available"
        )
        input_type = ModelInput.Text
        output_type = ModelType.Chat

    try:
        await register_llm(
            input_type,
            output_type,
            endpoint,
            server_args.model_path,
            server_args.served_model_name,
            kv_cache_block_size=server_args.page_size,
            migration_limit=dynamo_args.migration_limit,
            runtime_config=runtime_config,
            custom_template_path=dynamo_args.custom_jinja_template,
        )
    except Exception as e:
        # Deliberate best-effort: the caller decides what to do with a False
        # result. logging.exception keeps the same message but also records
        # the traceback so the failure can be diagnosed.
        logging.exception("Failed to register with runtime config: %s", e)
        return False

    logging.info("Successfully registered LLM with runtime config")
    return True
52
53


54
async def _get_runtime_config(
    engine: sgl.Engine, server_args: ServerArgs, dynamo_args: DynamoArgs
) -> Optional[ModelRuntimeConfig]:
    """Build a ``ModelRuntimeConfig`` from the SGLang engine and its arguments.

    The config is populated from three sources:

    * ``dynamo_args``: reasoning parser and tool call parser.
    * ``server_args``: ``max_running_requests`` -> ``max_num_seqs`` and
      ``max_prefill_tokens`` -> ``max_num_batched_tokens`` (only when set to a
      truthy value).
    * ``engine.scheduler_info``: ``max_total_num_tokens`` is converted into
      ``total_kv_blocks`` using the page size from
      ``engine.tokenizer_manager.server_args``.

    Args:
        engine: SGLang engine whose ``scheduler_info`` is inspected.
        server_args: SGLang server arguments.
        dynamo_args: Dynamo arguments carrying the parser settings.

    Returns:
        The runtime config. Every code path in this function returns the
        (possibly partially populated) config object; failures while reading
        scheduler information are logged as warnings and do not raise.
    """
    runtime_config = ModelRuntimeConfig()
    # set reasoning parser and tool call parser
    runtime_config.reasoning_parser = dynamo_args.reasoning_parser
    runtime_config.tool_call_parser = dynamo_args.tool_call_parser

    # In SGLang, these are server_args, not scheduler_info (unlike vLLM).
    # Note: If --max-running-requests is not specified, SGLang uses an internal
    # default undocumented value. The value here will be None if not explicitly
    # set by the user, in which case the field is left untouched.
    max_running_requests = getattr(server_args, "max_running_requests", None)
    if max_running_requests:
        runtime_config.max_num_seqs = max_running_requests

    max_prefill_tokens = getattr(server_args, "max_prefill_tokens", None)
    if max_prefill_tokens:
        runtime_config.max_num_batched_tokens = max_prefill_tokens

    try:
        # Check whether the engine exposes scheduler_info with computed values.
        scheduler_info = getattr(engine, "scheduler_info", None)
        if scheduler_info is None:
            # Scheduler values are unavailable: log it and fall through to
            # return the config built so far (parsers and server_args-derived
            # limits are still included; only total_kv_blocks is missing).
            logging.warning(
                "Could not access runtime config from SGLang engine. "
                "The engine may compute these values internally after initialization. "
                "Proceeding without runtime config - SGLang will use its internal defaults."
            )
        elif "max_total_num_tokens" in scheduler_info:
            # Note: max_running_requests and max_prefill_tokens are NOT
            # available in scheduler_info. SGLang separates configuration
            # (server_args) from runtime stats (scheduler_info), so only the
            # token capacity is read here.
            max_total_tokens = scheduler_info["max_total_num_tokens"]
            if max_total_tokens and hasattr(engine.tokenizer_manager, "server_args"):
                page_size = engine.tokenizer_manager.server_args.page_size
                if page_size:
                    # Ceiling division: a partially filled page still
                    # occupies one KV block.
                    runtime_config.total_kv_blocks = (
                        max_total_tokens + page_size - 1
                    ) // page_size
                    logging.info(
                        f"Got total KV blocks from scheduler: {runtime_config.total_kv_blocks} "
                        f"(max_total_tokens={max_total_tokens}, page_size={page_size})"
                    )
    except Exception as e:
        # Best-effort: any failure while probing the engine is downgraded to a
        # warning and the config gathered so far is still returned.
        logging.warning(f"Failed to get runtime config: {e}. Proceeding without it.")

    return runtime_config