# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ModelRunner runs the forward passes of the models."""

import datetime
import gc
import inspect
import json
import logging
import os
import socket
import threading
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from urllib.parse import urlparse

import requests
import torch
import torch.distributed as dist

from sglang.srt import slow_rank_detector
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig, LoadFormat
from sglang.srt.configs.model_config import AttentionArch, ModelConfig
from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp
from sglang.srt.connector import ConnectorType
from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
from sglang.srt.distributed import (
    get_pp_group,
    get_tp_group,
    get_world_group,
    init_distributed_environment,
    initialize_model_parallel,
    set_custom_all_reduce,
    set_mscclpp_all_reduce,
)
from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
from sglang.srt.eplb.eplb_manager import EPLBManager
from sglang.srt.eplb.expert_distribution import (
    ExpertDistributionRecorder,
    get_global_expert_distribution_recorder,
    set_global_expert_distribution_recorder,
)
from sglang.srt.eplb.expert_location import (
    ExpertLocationMetadata,
    compute_initial_expert_location_metadata,
    get_global_expert_location_metadata,
    set_global_expert_location_metadata,
)
from sglang.srt.eplb.expert_location_updater import ExpertLocationUpdater
from sglang.srt.layers.attention.attention_registry import (
    ATTENTION_BACKENDS,
    attn_backend_wrapper,
)
from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
from sglang.srt.layers.dp_attention import (
    get_attention_tp_group,
    get_attention_tp_size,
    initialize_dp_attention,
)
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
from sglang.srt.layers.quantization import (
    deep_gemm_wrapper,
    monkey_patch_isinstance_for_vllm_base_layer,
)
from sglang.srt.layers.sampler import Sampler
from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
from sglang.srt.lora.lora_manager import LoRAManager
from sglang.srt.lora.lora_registry import LoRARef
from sglang.srt.managers.schedule_batch import (
    GLOBAL_SERVER_ARGS_KEYS,
    global_server_args_dict,
)
from sglang.srt.mem_cache.allocator import (
    BaseTokenToKVPoolAllocator,
    PagedTokenToKVPoolAllocator,
    SWATokenToKVPoolAllocator,
    TokenToKVPoolAllocator,
)
from sglang.srt.mem_cache.allocator_ascend import AscendPagedTokenToKVPoolAllocator
from sglang.srt.mem_cache.memory_pool import (
    AscendMLAPagedTokenToKVPool,
    AscendTokenToKVPool,
    DoubleSparseTokenToKVPool,
    HybridLinearKVPool,
    HybridReqToTokenPool,
    MHATokenToKVPool,
    MLATokenToKVPool,
    ReqToTokenPool,
    SWAKVPool,
)
from sglang.srt.model_executor.cpu_graph_runner import CPUGraphRunner
from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
from sglang.srt.model_executor.npu_graph_runner import NPUGraphRunner
from sglang.srt.model_loader import get_model
from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
    trigger_init_weights_send_group_for_remote_instance_request,
)
from sglang.srt.model_loader.utils import set_default_torch_dtype
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.offloader import (
    create_offloader_from_server_args,
    get_offloader,
    set_offloader,
)
from sglang.srt.patch_torch import monkey_patch_torch_reductions
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
from sglang.srt.server_args import ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import (
    MultiprocessingSerializer,
    cpu_has_amx_support,
    dynamic_import,
    enable_show_time_cost,
    get_available_gpu_memory,
    get_bool_env_var,
    get_cpu_ids_by_node,
    init_custom_process_group,
    is_blackwell,
    is_fa3_default_architecture,
    is_flashinfer_available,
    is_hip,
    is_hopper_with_cuda_12_3,
    is_no_spec_infer_or_topk_one,
    is_npu,
    is_sm100_supported,
    log_info_on_rank0,
    monkey_patch_p2p_access_check,
    monkey_patch_vllm_gguf_config,
    parse_connector_type,
    set_cuda_arch,
)
from sglang.srt.weight_sync.tensor_bucket import (
    FlattenedTensorBucket,
    FlattenedTensorMetadata,
)

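# Attention backends that support models using MLA (multi-head latent attention).
# Additional backends can be registered at runtime via add_mla_attention_backend().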
MLA_ATTENTION_BACKENDS = [
    "aiter",
    "flashinfer",
    "fa3",
    "fa4",
    "triton",
    "flashmla",
    "cutlass_mla",
    "trtllm_mla",
    "ascend",
]


def add_mla_attention_backend(backend_name):
    if backend_name not in MLA_ATTENTION_BACKENDS:
        MLA_ATTENTION_BACKENDS.append(backend_name)
        logger.info(f"Added {backend_name} to MLA_ATTENTION_BACKENDS.")


_is_hip = is_hip()
_is_npu = is_npu()
_is_cpu_amx_available = cpu_has_amx_support()

# Use a small KV cache pool size for tests in CI
SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)

# Detect straggler ranks in model loading
UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300

logger = logging.getLogger(__name__)


if _is_npu:
    import torch_npu

    torch.npu.config.allow_internal_format = True
    torch_npu.npu.set_compile_mode(jit_compile=False)


class RankZeroFilter(logging.Filter):
    """Filter that only allows INFO level logs from rank 0, but allows all other levels from any rank."""

    def __init__(self, is_rank_zero):
        super().__init__()
        self.is_rank_zero = is_rank_zero

    def filter(self, record):
        if record.levelno == logging.INFO:
            return self.is_rank_zero
        return True


class ModelRunner:
    """ModelRunner runs the forward passes of the models."""

    def __init__(
        self,
        model_config: ModelConfig,
        mem_fraction_static: float,
        gpu_id: int,
        tp_rank: int,
        tp_size: int,
        moe_ep_rank: int,
        moe_ep_size: int,
        pp_rank: int,
        pp_size: int,
        nccl_port: int,
        server_args: ServerArgs,
        dp_rank: Optional[int] = None,
        is_draft_worker: bool = False,
        req_to_token_pool: Optional[ReqToTokenPool] = None,
        token_to_kv_pool_allocator: Optional[BaseTokenToKVPoolAllocator] = None,
    ):
        # Parse args
        self.mem_fraction_static = mem_fraction_static
        self.device = server_args.device
        self.gpu_id = gpu_id
        self.tp_rank = tp_rank
        self.tp_size = tp_size
        self.moe_ep_rank = moe_ep_rank
        self.moe_ep_size = moe_ep_size
        self.dp_size = server_args.dp_size
        self.pp_rank = pp_rank
        self.pp_size = pp_size
        self.model_config = model_config
        self.dist_port = nccl_port
        self.server_args = server_args
        self.is_draft_worker = is_draft_worker
        self.is_generation = model_config.is_generation
        self.is_multimodal = model_config.is_multimodal
        self.is_multimodal_chunked_prefill_supported = (
            model_config.is_multimodal_chunked_prefill_supported
        )
        self.spec_algorithm = SpeculativeAlgorithm.from_string(
            server_args.speculative_algorithm
        )
        self.page_size = server_args.page_size
        self.req_to_token_pool = req_to_token_pool
        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
        self.is_hybrid = model_config.is_hybrid
        self.use_mla_backend = self.model_config.attention_arch == AttentionArch.MLA
        self.attention_chunk_size = model_config.attention_chunk_size
        self.forward_pass_id = 0

        # Apply the rank zero filter to logger
        if not any(isinstance(f, RankZeroFilter) for f in logger.filters):
            logger.addFilter(RankZeroFilter(tp_rank == 0))
        if server_args.show_time_cost:
            enable_show_time_cost()

        # Model-specific adjustment
        self.model_specific_adjustment()

        # Global vars
        global_server_args_dict.update(
            {k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
            | {
                # TODO it is indeed not a "server args"
                "use_mla_backend": self.use_mla_backend,
                "speculative_algorithm": self.spec_algorithm,
            }
        )

        # Init OpenMP threads binding for CPU
        if self.device == "cpu":
            self.init_threads_binding()

        # Get memory before model loading
        min_per_gpu_memory = self.init_torch_distributed()

        # CPU offload
        set_offloader(create_offloader_from_server_args(server_args, dp_rank=dp_rank))

        if get_bool_env_var("SGLANG_DETECT_SLOW_RANK"):
            slow_rank_detector.execute()

        # Update the DeepGEMM configuration
        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
            deep_gemm_wrapper.update_deep_gemm_config(gpu_id, server_args)

        # Initialize the model runner
        self.initialize(min_per_gpu_memory)

        # Temporary cached values
        self.support_pp = (
            "pp_proxy_tensors" in inspect.signature(self.model.forward).parameters
        )

        # For weight updates
        self._model_update_group = {}
        self._weights_send_group = {}

    def initialize(self, min_per_gpu_memory: float):
        server_args = self.server_args

        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
            enable=self.server_args.enable_memory_saver
        )

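        # Global expert-location metadata and the expert-distribution recorder are
        # created once here; draft workers reuse the globals set up by the target worker.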
        if not self.is_draft_worker:
            set_global_expert_location_metadata(
                compute_initial_expert_location_metadata(server_args, self.model_config)
            )
            if self.tp_rank == 0 and get_bool_env_var(
                "SGLANG_LOG_EXPERT_LOCATION_METADATA"
            ):
                logger.info(
                    f"Initial expert_location_metadata: {get_global_expert_location_metadata()}"
                )

            set_global_expert_distribution_recorder(
                ExpertDistributionRecorder.init_new(
                    server_args,
                    get_global_expert_location_metadata(),
                    rank=self.tp_rank,
                )
            )

        # Expert parallelism
        self.eplb_manager = (
            EPLBManager(self)
            if self.server_args.enable_eplb and (not self.is_draft_worker)
            else None
        )
        self.expert_location_updater = ExpertLocationUpdater()

        # Load the model
        self.sampler = Sampler()
        self.load_model()

        # Check if the model is using hybrid SWA
        if (
            not self.server_args.disable_hybrid_swa_memory
            and self.sliding_window_size is not None
            and self.sliding_window_size > 0
        ):
            architectures = self.model_config.hf_config.architectures
            if architectures and not any("Llama4" in arch for arch in architectures):
                self.is_hybrid = self.model_config.is_hybrid = True

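        # Hybrid GDN models keep a separate mamba cache: size it from
        # --max-running-requests (default 512) and split it across DP ranks
        # when DP attention is enabled.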
        if self.is_hybrid_gdn:
            logger.warning("Hybrid GDN model detected, disable radix cache")
            self.server_args.disable_radix_cache = True
            if self.server_args.max_mamba_cache_size is None:
                if self.server_args.max_running_requests is not None:
                    self.server_args.max_mamba_cache_size = (
                        self.server_args.max_running_requests
                    )
                else:
                    self.server_args.max_mamba_cache_size = 512
            self.server_args.max_mamba_cache_size = (
                self.server_args.max_mamba_cache_size
                // (
                    self.server_args.dp_size
                    if self.server_args.enable_dp_attention
                    else 1
                )
            )

        # For MTP models like DeepSeek-V3 or GLM-4.5, the MTP layer(s) are used separately as draft
        # models for speculative decoding. In those cases, `num_nextn_predict_layers` is used to
        # determine the number of layers.
        model_has_mtp_layers = self.model_config.num_nextn_predict_layers is not None
        model_num_layers = (
            self.model_config.num_nextn_predict_layers
            if self.is_draft_worker and model_has_mtp_layers
            else max(
                self.model_config.num_hidden_layers,
                self.model_config.num_attention_layers,
            )
        )
        self.start_layer = getattr(self.model, "start_layer", 0)
        self.end_layer = getattr(self.model, "end_layer", model_num_layers)
        self.num_effective_layers = self.end_layer - self.start_layer
        assert (
            (not model_has_mtp_layers)
            or (self.spec_algorithm.is_none())
            or (
                (not self.spec_algorithm.is_none())
                and (self.num_effective_layers == model_num_layers)
            )
        ), "PP is not compatible with MTP models."

        # Apply torchao quantization
        torchao_applied = getattr(self.model, "torchao_applied", False)
        # In layered loading, torchao may have been applied
        if not torchao_applied:
            apply_torchao_config_to_model(
                self.model, global_server_args_dict["torchao_config"]
            )

        # Apply torch TP if the model supports it
        supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
        if self.tp_size > 1 and supports_torch_tp:
            self.apply_torch_tp()

        # Init lora
        if server_args.enable_lora:
            self.init_lora_manager()

        # Init Double Sparsity
        if server_args.enable_double_sparsity:
            if server_args.ds_heavy_channel_type is None:
                raise ValueError(
                    "Please specify the heavy channel type for double sparsity optimization."
                )
            self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)

        # Enable batch invariant mode
        if server_args.enable_deterministic_inference:
            from sglang.srt.batch_invariant_ops import enable_batch_invariant_mode

            enable_batch_invariant_mode()

        # Init memory pool and attention backends
        self.init_memory_pool(
            min_per_gpu_memory,
            server_args.max_running_requests,
            server_args.max_total_tokens,
        )
        if self.device == "cuda":
            self.init_cublas()
            self.init_attention_backend()
            self.init_device_graphs()
        elif self.device in ["npu", "cpu"]:
            self.init_attention_backend()
            self.init_device_graphs()
        else:
            self.graph_runner = None
            self.graph_mem_usage = 0
            self.init_attention_backend()

        # auxiliary hidden capture mode. TODO: expose this to server args?
        if self.spec_algorithm.is_eagle3() and not self.is_draft_worker:
            # load draft config
            draft_model_config = ModelConfig.from_server_args(
                server_args,
                model_path=(server_args.speculative_draft_model_path),
                is_draft_model=True,
            )

            try:
                # get the aux layer from draft model config
                eagle_config = getattr(
                    draft_model_config.hf_config, "eagle_config", None
                )
                eagle_aux_hidden_state_layer_ids = eagle_config[
                    "eagle_aux_hidden_state_layer_ids"
                ]
            except Exception:
                # if there is no aux layer, set to None
                eagle_aux_hidden_state_layer_ids = None

            self.model.set_eagle3_layers_to_capture(eagle_aux_hidden_state_layer_ids)

    def model_specific_adjustment(self):
        server_args = self.server_args

        if (
            server_args.attention_backend == "intel_amx"
            and server_args.device == "cpu"
            and not _is_cpu_amx_available
        ):
            logger.info(
                "The current platform does not support Intel AMX, will fall back to the torch_native backend."
            )
            server_args.attention_backend = "torch_native"

        if server_args.prefill_attention_backend is not None and (
            server_args.prefill_attention_backend
            == server_args.decode_attention_backend
        ):  # override the default attention backend
            server_args.attention_backend = server_args.prefill_attention_backend

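        # Models configured with dual chunk attention must use the dedicated
        # dual_chunk_flash_attn backend.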
        if (
            getattr(self.model_config.hf_config, "dual_chunk_attention_config", None)
            is not None
        ):
            if server_args.attention_backend is None:
                server_args.attention_backend = "dual_chunk_flash_attn"
                logger.info("Dual chunk attention is turned on by default.")
            elif server_args.attention_backend != "dual_chunk_flash_attn":
                raise ValueError(
                    "Dual chunk attention is enabled, but attention backend is set to "
                    f"{server_args.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
                )

        if server_args.attention_backend is None:
            """
            Auto select the fastest attention backend.

            1. Models with MHA architecture (e.g., Llama, Qwen)
                1.1 We will turn on FA3 on Hopper unless the user uses spec decode with topk > 1 or page_size > 1.
                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
            2. Models with MLA architecture
                2.1 We will use FA3 backend on Hopper.
                2.2 We will use FlashInfer backend on Blackwell.
                2.3 Otherwise, we will use triton backend.
            """

            if not self.use_mla_backend:
                # MHA architecture
                if (
                    is_hopper_with_cuda_12_3()
                    and is_no_spec_infer_or_topk_one(server_args)
                    and is_fa3_default_architecture(self.model_config.hf_config)
                ):
                    server_args.attention_backend = "fa3"
                elif _is_hip:
                    server_args.attention_backend = "aiter"
                elif _is_npu:
                    server_args.attention_backend = "ascend"
                else:
                    server_args.attention_backend = (
                        "flashinfer" if is_flashinfer_available() else "triton"
                    )
            else:
                # MLA architecture
                if is_hopper_with_cuda_12_3():
                    server_args.attention_backend = "fa3"
                elif is_sm100_supported():
                    server_args.attention_backend = "flashinfer"
                elif _is_hip:
                    head_num = self.model_config.get_num_kv_heads(self.tp_size)
                    # TODO: currently aiter only supports head numbers 16 or 128
                    if (
                        head_num == 128 or head_num == 16
                    ) and self.spec_algorithm.is_none():
                        server_args.attention_backend = "aiter"
                    else:
                        server_args.attention_backend = "triton"
                elif _is_npu:
                    server_args.attention_backend = "ascend"
                else:
                    server_args.attention_backend = "triton"
            logger.info(
                f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
            )
        elif self.use_mla_backend:
            if server_args.device != "cpu":
                if server_args.attention_backend in MLA_ATTENTION_BACKENDS:
                    logger.info(
                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
                    )
                else:
                    raise ValueError(
                        f"Invalid attention backend for MLA: {server_args.attention_backend}"
                    )
            else:
                if server_args.attention_backend != "intel_amx":
                    raise ValueError(
                        "MLA optimization not supported on CPU except for intel_amx backend."
                    )

        if (
            server_args.attention_backend == "fa3"
            and server_args.kv_cache_dtype == "fp8_e5m2"
        ):
            logger.warning(
                "FlashAttention3 only supports fp8_e4m3 if using FP8; "
                "Setting attention backend to triton."
            )
            server_args.attention_backend = "triton"

        if server_args.enable_double_sparsity:
            logger.info(
                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
            )
            server_args.attention_backend = "triton"
            server_args.disable_cuda_graph = True

        if self.is_multimodal:
            if not self.is_multimodal_chunked_prefill_supported:
                server_args.chunked_prefill_size = -1
                logger.info(
                    f"Automatically turn off --chunked-prefill-size as it is not supported for "
                    f"{self.model_config.hf_config.model_type}"
                )

        if not self.use_mla_backend:
            server_args.disable_chunked_prefix_cache = True

        if not server_args.disable_chunked_prefix_cache:
            logger.info("Chunked prefix cache is turned on.")

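        # With the aiter backend and long context lengths, leave extra memory headroom.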
        if server_args.attention_backend == "aiter":
            if self.model_config.context_len > 8192:
                self.mem_fraction_static *= 0.85

        if (
            server_args.enable_hierarchical_cache
            and server_args.hicache_io_backend == "kernel"
        ):
            # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
            if server_args.decode_attention_backend is None:
                if not self.use_mla_backend:
                    server_args.decode_attention_backend = (
                        "flashinfer" if is_flashinfer_available() else "triton"
                    )
                else:
                    server_args.decode_attention_backend = (
                        "flashinfer" if is_sm100_supported() else "triton"
                    )
            elif server_args.decode_attention_backend == "fa3":
                server_args.hicache_io_backend = "direct"
                logger.warning(
                    "FlashAttention3 decode backend is not compatible with hierarchical cache. "
                    f"Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
                )

    def init_torch_distributed(self):
        logger.info("Init torch distributed begin.")

        try:
            torch.get_device_module(self.device).set_device(self.gpu_id)
        except Exception:
            logger.warning(
                f"Context: {self.device=} {self.gpu_id=} {os.environ.get('CUDA_VISIBLE_DEVICES')=} {self.tp_rank=} {self.tp_size=}"
            )
            raise

        if self.device == "cuda":
            backend = "nccl"
        elif self.device == "xpu":
            backend = "xccl"
        elif self.device == "hpu":
            backend = "hccl"
        elif self.device == "cpu":
            backend = "gloo"
        elif self.device == "npu":
            backend = "hccl"

        before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
        if not self.server_args.enable_p2p_check:
            monkey_patch_p2p_access_check()

        if self.server_args.dist_init_addr:
            dist_init_method = f"tcp://{self.server_args.dist_init_addr}"
        else:
            dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
        set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
        set_mscclpp_all_reduce(self.server_args.enable_mscclpp)

        if not self.is_draft_worker:
            if self.device == "cpu":
                if _is_cpu_amx_available:
                    # Bind OpenMP threads to CPU cores
                    torch.ops.sgl_kernel.init_cpu_threads_env(self.local_omp_cpuid)

                    # Set local size to hint SGLang to use shared memory based AllReduce
                    os.environ["LOCAL_SIZE"] = str(self.tp_size)
                    torch.ops.sgl_kernel.initialize(self.tp_size, self.tp_rank)

                    @torch.library.register_fake("sgl_kernel::shm_allgather")
                    def _(data, dim):
                        return torch.cat([data] * self.tp_size, dim=dim)

                else:
                    logger.warning(
                        "init_cpu_threads_env and shared-memory-based AllReduce are disabled because the intel_amx backend is not available."
                    )

            # Only initialize the distributed environment on the target model worker.
            init_distributed_environment(
                backend=backend,
                world_size=self.tp_size * self.pp_size,
                rank=self.tp_size * self.pp_rank + self.tp_rank,
                local_rank=self.gpu_id,
                distributed_init_method=dist_init_method,
                timeout=self.server_args.dist_timeout,
            )
            initialize_model_parallel(
                tensor_model_parallel_size=self.tp_size,
                pipeline_model_parallel_size=self.pp_size,
                expert_model_parallel_size=self.moe_ep_size,
                duplicate_tp_group=self.server_args.enable_pdmux,
            )
            initialize_dp_attention(
                server_args=self.server_args,
                model_config=self.model_config,
            )

        min_per_gpu_memory = get_available_gpu_memory(
            self.device,
            self.gpu_id,
            distributed=get_world_group().world_size > 1,
            cpu_group=get_world_group().cpu_group,
        )
        self.tp_group = get_tp_group()
        self.pp_group = get_pp_group()
        self.attention_tp_group = get_attention_tp_group()

        # Check memory for tensor parallelism
        local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
        if self.tp_size > 1 and not self.is_draft_worker:
            if min_per_gpu_memory < local_gpu_memory * 0.9:
                if get_bool_env_var("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"):
                    logger.warning(
                        "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
                        f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
                    )
                else:
                    raise ValueError(
                        "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
                        f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
                    )

        logger.info(
            f"Init torch distributed ends. mem usage={(before_avail_memory - local_gpu_memory):.2f} GB"
        )
        return min_per_gpu_memory

    def load_model(self):
        before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
        logger.info(
            f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

        # This can reduce thread conflicts and speed up weight loading.
        if self.device != "cpu":
            torch.set_num_threads(1)
        if self.device == "cuda":
            if torch.cuda.get_device_capability()[0] < 8:
                logger.info(
                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
                )
                self.server_args.dtype = "float16"
                self.model_config.dtype = torch.float16
                if torch.cuda.get_device_capability()[1] < 5:
                    raise RuntimeError("SGLang only supports sm75 and above.")

        set_cuda_arch()

        # Prepare the model config
        self.load_config = LoadConfig(
            load_format=self.server_args.load_format,
            download_dir=self.server_args.download_dir,
            model_loader_extra_config=self.server_args.model_loader_extra_config,
            tp_rank=self.tp_rank,
            remote_instance_weight_loader_seed_instance_ip=self.server_args.remote_instance_weight_loader_seed_instance_ip,
            remote_instance_weight_loader_seed_instance_service_port=self.server_args.remote_instance_weight_loader_seed_instance_service_port,
            remote_instance_weight_loader_send_weights_group_ports=self.server_args.remote_instance_weight_loader_send_weights_group_ports,
        )
        if self.device == "cpu":
            self.model_config = adjust_config_with_unaligned_cpu_tp(
                self.model_config, self.load_config, self.tp_size
            )
        if self.server_args.load_format == "gguf":
            monkey_patch_vllm_gguf_config()

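        # For remote-instance loading, TP rank 0 asks the seed instance to set up the
        # process group over which the weights will be sent.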
        if self.server_args.load_format == LoadFormat.REMOTE_INSTANCE:
            if self.tp_rank == 0:
                instance_ip = socket.gethostbyname(socket.gethostname())
                t = threading.Thread(
                    target=trigger_init_weights_send_group_for_remote_instance_request,
                    args=(
                        self.server_args.remote_instance_weight_loader_seed_instance_ip,
                        self.server_args.remote_instance_weight_loader_seed_instance_service_port,
                        self.server_args.remote_instance_weight_loader_send_weights_group_ports,
                        instance_ip,
                    ),
                )
                t.start()

        # Load the model
        # Remove this monkey patch once linear.py quantization no longer depends on vllm
        monkey_patch_vllm_parallel_state()
        monkey_patch_isinstance_for_vllm_base_layer()

        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_WEIGHTS):
            self.model = get_model(
                model_config=self.model_config,
                load_config=self.load_config,
                device_config=DeviceConfig(self.device, self.gpu_id),
            )
        monkey_patch_vllm_parallel_state(reverse=True)
        monkey_patch_isinstance_for_vllm_base_layer(reverse=True)

        get_offloader().post_init()

        if self.server_args.kv_cache_dtype == "fp8_e4m3":
            if self.server_args.quantization_param_path is not None:
                if callable(getattr(self.model, "load_kv_cache_scales", None)):
                    self.model.load_kv_cache_scales(
                        self.server_args.quantization_param_path
                    )
                    logger.info(
                        "Loaded KV cache scaling factors from %s",
                        self.server_args.quantization_param_path,
                    )
                else:
                    raise RuntimeError(
                        "Using FP8 KV cache and scaling factors provided but "
                        f"model {self.model.__class__} does not support loading scaling factors."
                    )
            else:
                logger.warning(
                    "Using FP8 KV cache but no scaling factors "
                    "provided. Defaulting to scaling factors of 1.0. "
                    "This may lead to less accurate results!"
                )

        # Parse other args
        self.sliding_window_size = None
        if hasattr(self.model, "get_attention_sliding_window_size"):
            self.sliding_window_size = self.model.get_attention_sliding_window_size()
        elif self.model_config.attention_chunk_size is not None:
            self.sliding_window_size = self.model_config.attention_chunk_size
            logger.info(
                f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}"
            )

        self.dtype = self.model_config.dtype

        after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
        self.weight_load_mem_usage = before_avail_memory - after_avail_memory
        logger.info(
            f"Load weight end. "
            f"type={type(self.model).__name__}, "
            f"dtype={self.dtype}, "
            f"avail mem={after_avail_memory:.2f} GB, "
            f"mem usage={self.weight_load_mem_usage:.2f} GB."
        )

        # Handle the case where some ranks do not finish loading.
        try:
            dist.monitored_barrier(
                group=get_tp_group().cpu_group,
                timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S),
                wait_all_ranks=True,
            )
        except RuntimeError:
            raise ValueError(
                f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
            ) from None

    def update_expert_location(
        self,
        new_expert_location_metadata: ExpertLocationMetadata,
        update_layer_ids: List[int],
    ):
        self.expert_location_updater.update(
            self.model.routed_experts_weights_of_layer,
            new_expert_location_metadata,
            update_layer_ids=update_layer_ids,
            nnodes=self.server_args.nnodes,
            rank=self.tp_rank,
        )

    def update_weights_from_disk(
        self, model_path: str, load_format: str
    ) -> tuple[bool, str]:
        """Update engine weights in-place from the disk."""
        logger.info(
            f"Update engine weights online from disk begin. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

        target_device = torch.device(self.device)
        self.model_config.model_path = model_path
        load_config = LoadConfig(load_format=load_format)

        # Only support DefaultModelLoader for now
        loader = get_model_loader(load_config)
        if not isinstance(loader, DefaultModelLoader):
            message = f"Failed to get model loader: {loader}."
            return False, message

        def get_weight_iter(config):
            iter = loader._get_weights_iterator(
                DefaultModelLoader.Source.init_new(config, self.model)
            )
            return iter

        def model_load_weights(model, iter):
            DefaultModelLoader.load_weights_and_postprocess(model, iter, target_device)
            return model

        with set_default_torch_dtype(self.model_config.dtype):
            try:
                iter = get_weight_iter(self.model_config)
            except Exception as e:
                message = f"Failed to get weights iterator: {e}."
                return False, message
            try:
                model = model_load_weights(self.model, iter)
            except Exception as e:
                message = (
                    f"Failed to update weights: {e}.\nRolling back to original weights."
                )
                del iter
                gc.collect()
                iter = get_weight_iter(self.model_config)
                self.model = model_load_weights(self.model, iter)
                return False, message

        self.model = model
        self.server_args.model_path = model_path
        self.server_args.load_format = load_format
        self.load_config = load_config

        logger.info("Update weights end.")
        return True, "Succeeded to update model weights."

    def init_weights_send_group_for_remote_instance(
        self,
        master_address,
        ports,
        group_rank,
        world_size,
        group_name,
        backend="nccl",
    ):
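        """Create the custom process group used to send model weights to a remote instance."""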
        assert (
            torch.distributed.is_initialized()
        ), "Default torch process group must be initialized"
        assert group_name != "", "Group name cannot be empty"

        ports_list = ports.split(",")
        assert (
            len(ports_list) == self.tp_size
        ), f"Expected {self.tp_size} ports, but got {len(ports_list)} ports."
        group_port = ports_list[self.tp_rank]
        group_name = f"{group_name}_{group_port}_{self.tp_rank}"

        logger.info(
            f"init custom process group: tp_rank={self.tp_rank}, gpu_id={self.gpu_id}, master_address={master_address}, master_port={group_port}, "
            f"group_rank={group_rank}, world_size={world_size}, group_name={group_name}, backend={backend}"
        )

        torch.cuda.empty_cache()
        success = False
        message = ""
        try:
            self._weights_send_group[group_name] = init_custom_process_group(
                backend=backend,
                init_method=f"tcp://{master_address}:{group_port}",
                world_size=world_size,
                rank=group_rank,
                group_name=group_name,
                device_id=torch.device("cuda", self.gpu_id),
            )
            dist.barrier(group=self._weights_send_group[group_name])
            success = True
            message = (
                f"Succeeded to init group through {master_address}:{group_port} group."
            )
        except Exception as e:
            message = f"Failed to init group: {e}."
            logger.error(message)

        torch.cuda.empty_cache()
        return success, message

    def send_weights_to_remote_instance(
        self,
        master_address,
        ports,
        group_name,
    ):
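        """Broadcast model weights over a group created by init_weights_send_group_for_remote_instance."""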
        assert (
            torch.distributed.is_initialized()
        ), "Default torch process group must be initialized"
        assert group_name != "", "Group name cannot be empty"

        ports_list = ports.split(",")
        assert (
            len(ports_list) == self.tp_size
        ), f"Expected {self.tp_size} ports, but got {len(ports_list)} ports."
        group_port = ports_list[self.tp_rank]
        group_name = f"{group_name}_{group_port}_{self.tp_rank}"

        if self._weights_send_group.get(group_name) is not None:
            send_group = self._weights_send_group[group_name]
        else:
            message = f"Group {group_name} not in _weights_send_group list. Please call `init_weights_send_group_for_remote_instance` first."
            logger.error(message)
            return False, message

        torch.cuda.empty_cache()
        success = False
        message = ""
        try:
            for _, weights in self.model.named_parameters():
                torch.distributed.broadcast(
                    weights,
                    src=0,
                    group=send_group,
                )
            success = True
            message = f"Succeeded to send weights through {master_address}:{group_port} {group_name}."
        except Exception as e:
            message = f"Failed to send weights: {e}."
            logger.error(message)

        # destroy the process group after sending weights
        del self._weights_send_group[group_name]
        torch.distributed.distributed_c10d.destroy_process_group(send_group)
        torch.cuda.empty_cache()
        return success, message

    def init_weights_update_group(
        self,
        master_address,
        master_port,
        rank_offset,
        world_size,
        group_name,
        backend="nccl",
    ):
        """Initialize the Torch process group for model parameter updates.

        `_model_update_group` is used in the RLHF workflow, where rank
        0 is the actor model in the training engine, and the other ranks are
        the inference engine, which is used for rollout.

        In the RLHF workflow, the training engine updates the model
        weights/parameters online, and broadcasts them to the inference
        engine through the `_model_update_group` process group.
        """
        assert (
            torch.distributed.is_initialized()
        ), "Default torch process group must be initialized"
        assert group_name != "", "Group name cannot be empty"

        rank = rank_offset + self.tp_rank

        logger.info(
            f"init custom process group: master_address={master_address}, master_port={master_port}, "
            f"rank_offset={rank_offset}, rank={rank}, world_size={world_size}, group_name={group_name}, backend={backend}"
        )

        try:
            self._model_update_group[group_name] = init_custom_process_group(
                backend=backend,
                init_method=f"tcp://{master_address}:{master_port}",
                world_size=world_size,
                rank=rank,
                group_name=group_name,
            )
            return True, "Succeeded to initialize custom process group."
        except Exception as e:
            message = f"Failed to initialize custom process group: {e}."
            logger.error(message)
            return False, message

    def destroy_weights_update_group(self, group_name):
        try:
            if group_name in self._model_update_group:
                pg = self._model_update_group.pop(group_name)
                torch.distributed.destroy_process_group(pg)
                return True, "Succeeded to destroy custom process group."
            else:
                return False, "The group to be destroyed does not exist."
        except Exception as e:
            message = f"Failed to destroy custom process group: {e}."
            logger.error(message)
            return False, message

    def update_weights_from_distributed(self, names, dtypes, shapes, group_name):
        """
        Update specific parameter in the model weights online
        through `_model_update_group` process group.

        Args:
            name: the name of the parameter to be updated.
            dtype: the data type of the parameter to be updated.
            shape: the shape of the parameter to be updated.
        """

        assert group_name in self._model_update_group, (
            f"Group {group_name} not in {list(self._model_update_group.keys())}. "
            "Please call `init_weights_update_group` first."
        )

        try:
            weights = []
            handles = []
            for name, dtype, shape in zip(names, dtypes, shapes):
                target_dtype = (
                    dtype if isinstance(dtype, torch.dtype) else getattr(torch, dtype)
                )
                weight = torch.empty(shape, dtype=target_dtype, device=self.device)
                handles.append(
                    torch.distributed.broadcast(
                        weight,
                        src=0,
                        group=self._model_update_group[group_name],
                        async_op=True,
                    )
                )
                weights.append((name, weight))
            for handle in handles:
                handle.wait()

            self.model.load_weights(weights)
            return True, f"Succeeded to update parameter online."

        except Exception as e:
            error_msg = (
                f"Failed to update parameter online: {e}. "
                f"The full weights of the ModelRunner are partially updated. "
                f"Please discard the whole weights."
            )
            logger.error(error_msg)
            return False, error_msg

    def update_weights_from_tensor(
        self,
        named_tensors: List[Tuple[str, Union[torch.Tensor, "LocalSerializedTensor"]]],
        load_format: Optional[str] = None,
    ):
        monkey_patch_torch_reductions()
        if load_format == "flattened_bucket":
            # Handle flattened bucket format
            return self._update_weights_from_flattened_bucket(
                flattened_tensor_bucket_dict=named_tensors
            )

        # We need to get device after patch otherwise the device would be wrong
        self.device_module = torch.get_device_module(self.device)
        infered_device = self.device_module.current_device()

        named_tensors = [
            (name, _unwrap_tensor(tensor, tp_rank=self.tp_rank, device=infered_device))
            for name, tensor in named_tensors
        ]
        if load_format == "direct":
            _model_load_weights_direct(self.model, named_tensors)
        elif load_format in self.server_args.custom_weight_loader:
            custom_loader = dynamic_import(load_format)
            custom_loader(self.model, named_tensors)
        elif load_format is None:
            self.model.load_weights(named_tensors)
        else:
            raise NotImplementedError(f"Unknown load_format={load_format}")
        return True, "Success"

    def _update_weights_from_flattened_bucket(
        self,
        flattened_tensor_bucket_dict,
    ):
        """Handle flattened bucket format for weight updates"""
        flattened_tensor = flattened_tensor_bucket_dict["flattened_tensor"]
        metadata = flattened_tensor_bucket_dict["metadata"]

        # Convert metadata dict to our format
        converted_metadata = []
        for meta in metadata:
            converted_meta = FlattenedTensorMetadata(
                name=meta.name,
                shape=meta.shape,
                dtype=meta.dtype,
                start_idx=meta.start_idx,
                end_idx=meta.end_idx,
                numel=meta.numel,
            )
            converted_metadata.append(converted_meta)

        # Create bucket and reconstruct tensors
        bucket = FlattenedTensorBucket(
            flattened_tensor=flattened_tensor, metadata=converted_metadata
        )
        reconstructed_tensors = bucket.reconstruct_tensors()

        # Load the reconstructed tensors using the standard method
        self.model.load_weights(reconstructed_tensors)

        return True, "Success"

    def get_weights_by_name(
        self, name: str, truncate_size: int = 100
    ) -> Optional[torch.Tensor]:
        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.

        Only used for unit tests; performance is not optimized.
        For optimized performance, please use torch.save and torch.load.
        """
        # TODO: (chenyang) Add support for Qwen models.
        try:
            return self.model.get_weights_by_name(
                name, truncate_size, tp_size=self.tp_size
            )
        except Exception as e:
            logger.error(f"Error when getting parameter {name}: {e}")
            return None

    def init_lora_manager(self):
        self.lora_manager = LoRAManager(
            base_model=self.model,
            base_hf_config=self.model_config.hf_config,
            max_loras_per_batch=self.server_args.max_loras_per_batch,
            load_config=self.load_config,
            dtype=self.dtype,
            lora_backend=self.server_args.lora_backend,
            tp_size=self.tp_size,
            tp_rank=self.tp_rank,
            max_lora_rank=self.server_args.max_lora_rank,
            target_modules=self.server_args.lora_target_modules,
            lora_paths=self.server_args.lora_paths,
            server_args=self.server_args,
        )

    def load_lora_adapter(self, lora_ref: LoRARef):
        """Load a new LoRA adapter from disk or Hugging Face."""

        logger.info(
            f"LoRA adapter loading starts: {lora_ref}. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

        result = self.lora_manager.load_lora_adapter(lora_ref)

        logger.info(
            f"LoRA adapter loading completes: {lora_ref}. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

        return result

    def unload_lora_adapter(self, lora_ref: LoRARef):
        """Unload a LoRA adapter that was previously loaded during initialization or dynamic loading."""

        logger.info(
            f"LoRA adapter unloading starts: {lora_ref}. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

        result = self.lora_manager.unload_lora_adapter(lora_ref)

        logger.info(
            f"LoRA adapter unloading completes: {lora_ref}. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

        return result

    def profile_max_num_token(self, total_gpu_memory: int):
        available_gpu_memory = get_available_gpu_memory(
            self.device,
            self.gpu_id,
            distributed=get_world_group().world_size > 1,
            cpu_group=get_world_group().cpu_group,
        )
        if self.is_draft_worker:
            num_layers = getattr(
                self.model_config.hf_config,
                "num_nextn_predict_layers",
                self.num_effective_layers,
            )
        elif self.is_hybrid_gdn:
            num_layers = len(self.model_config.hf_config.full_attention_layer_ids)
        else:
            num_layers = self.num_effective_layers
        if self.use_mla_backend:
            cell_size = (
                (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
                * num_layers
                * torch._utils._element_size(self.kv_cache_dtype)
            )
        else:
            cell_size = (
                self.model_config.get_num_kv_heads(get_attention_tp_size())
                * self.model_config.head_dim
                * num_layers
                * 2
                * torch._utils._element_size(self.kv_cache_dtype)
            )
        rest_memory = available_gpu_memory - total_gpu_memory * (
            1 - self.mem_fraction_static
        )
        if self.is_hybrid_gdn:
            rest_memory -= (
                self.server_args.max_mamba_cache_size
                * self.model_config.hf_config.mamba_cache_per_req
                / (1 << 30)
            )
        max_num_token = int(rest_memory * (1 << 30) // cell_size)
        return max_num_token
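    # Worked example for the cell_size computation in profile_max_num_token above
    # (hypothetical numbers): for an MHA model with 8 KV heads per attention-TP rank,
    # head_dim=128, 32 layers, and a 2-byte KV cache dtype,
    # cell_size = 8 * 128 * 32 * 2 (K and V) * 2 = 131072 bytes per token, so
    # 10 GiB of rest_memory yields int(10 * (1 << 30) // 131072) = 81920 tokens.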

    @property
    def is_hybrid_gdn(self):
        return self.model_config.hf_config.architectures[0] in [
            "Qwen3NextForCausalLM",
            "Qwen3NextForCausalLMMTP",
            "FalconH1ForCausalLM",
        ]

    def set_num_token_hybrid(self):
        if (
            "Llama4ForConditionalGeneration"
            in self.model_config.hf_config.architectures
        ):
            temp_ratio = (
                (1 - self.is_hybrid)
                + self.is_hybrid
                * self.attention_chunk_size
                / self.model_config.context_len
            )
            self.swa_max_total_num_tokens = (
                4 * self.max_total_num_tokens * temp_ratio // (3 * temp_ratio + 1)
            )
            self.full_max_total_num_tokens = (
                4 * self.max_total_num_tokens
                - 12 * self.max_total_num_tokens * temp_ratio // (3 * temp_ratio + 1)
            )
            self.swa_max_total_num_tokens = int(
                self.swa_max_total_num_tokens
                // self.server_args.page_size
                * self.server_args.page_size
            )
            self.full_max_total_num_tokens = int(
                self.full_max_total_num_tokens
                // self.server_args.page_size
                * self.server_args.page_size
            )
            self.max_total_num_tokens = self.full_max_total_num_tokens
        else:
            assert self.sliding_window_size is not None and self.sliding_window_size > 0
            full_attention_layer_ids = []
            swa_attention_layer_ids = []

            try:
                layers = self.model.model.layers
            except AttributeError:
                try:
                    layers = self.model.language_model.model.layers
                except AttributeError:
                    try:
                        layers = self.model.language_model.layers
                    except AttributeError:
                        self.is_hybrid = False
                        return

            for layer in layers:
                if (
                    layer.self_attn.attn.sliding_window_size is None
                    or layer.self_attn.attn.sliding_window_size == -1
                ):
                    full_attention_layer_ids.append(layer.layer_id)
                else:
                    swa_attention_layer_ids.append(layer.layer_id)
            self.model_config.swa_attention_layer_ids = swa_attention_layer_ids
            self.model_config.full_attention_layer_ids = full_attention_layer_ids

            # Algorithm:
            # The existing max_total_num_tokens is per layer and assumes all layers hold the same number of tokens.
            # - Find total # of tokens available across layers.
            # - Calculate full_max_total_num_tokens and swa_max_total_num_tokens based on the given swa_full_tokens_ratio.
            total_tokens = (
                self.max_total_num_tokens * self.model_config.num_hidden_layers
            )
            full_layers_num = len(full_attention_layer_ids)
            swa_layers_num = len(swa_attention_layer_ids)
            swa_full_tokens_ratio = self.server_args.swa_full_tokens_ratio

            # Solve the equations:
            # 1. swa_max_total_num_tokens * swa_layers_num + full_max_total_num_tokens * full_layers_num == total_tokens
            # 2. full_max_total_num_tokens * swa_full_tokens_ratio == swa_max_total_num_tokens
            denominator = swa_full_tokens_ratio * swa_layers_num + full_layers_num
            self.full_max_total_num_tokens = int(total_tokens / denominator)
            self.swa_max_total_num_tokens = int(
                self.full_max_total_num_tokens * swa_full_tokens_ratio
            )
            self.max_total_num_tokens = self.full_max_total_num_tokens

            logger.info(
                f"Use Sliding window memory pool. full_layer_tokens={self.full_max_total_num_tokens}, swa_layer_tokens={self.swa_max_total_num_tokens}"
            )
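            # Worked example for the SWA/full split above (hypothetical numbers):
            # with max_total_num_tokens=10000 and 40 layers (30 SWA + 10 full),
            # total_tokens = 400000. With swa_full_tokens_ratio=0.5 the denominator is
            # 0.5 * 30 + 10 = 25, so full_max_total_num_tokens = 16000 and
            # swa_max_total_num_tokens = 8000; check: 8000 * 30 + 16000 * 10 == 400000.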

    def init_memory_pool(
        self,
        total_gpu_memory: int,
        max_num_reqs: Optional[int] = None,
        max_total_tokens: Optional[int] = None,
    ):
        # Determine the kv cache dtype
        if self.server_args.kv_cache_dtype == "auto":
            quant_config = getattr(self.model, "quant_config", None)
            kv_cache_quant_algo = getattr(quant_config, "kv_cache_quant_algo", None)
            if (
                isinstance(kv_cache_quant_algo, str)
                and kv_cache_quant_algo.upper() == "FP8"
            ):
                if _is_hip:
                    self.kv_cache_dtype = torch.float8_e4m3fnuz
                else:
                    self.kv_cache_dtype = torch.float8_e4m3fn
            else:
                self.kv_cache_dtype = self.dtype
        elif self.server_args.kv_cache_dtype == "fp8_e5m2":
            if _is_hip:  # Using natively supported format
                self.kv_cache_dtype = torch.float8_e5m2fnuz
            else:
                self.kv_cache_dtype = torch.float8_e5m2
        elif self.server_args.kv_cache_dtype == "fp8_e4m3":
            if _is_hip:  # Using natively supported format
                self.kv_cache_dtype = torch.float8_e4m3fnuz
            else:
                self.kv_cache_dtype = torch.float8_e4m3fn
        else:
            raise ValueError(
                f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."
            )

        log_info_on_rank0(logger, f"Using KV cache dtype: {self.kv_cache_dtype}")

        self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
        if SGLANG_CI_SMALL_KV_SIZE:
            self.max_total_num_tokens = int(SGLANG_CI_SMALL_KV_SIZE)

        if max_num_reqs is None:
            max_num_reqs = min(
                max(
                    int(
                        self.max_total_num_tokens / self.model_config.context_len * 512
                    ),
                    2048,
                ),
                4096,
            )
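        # Worked example for the heuristic above (hypothetical numbers): with
        # max_total_num_tokens=500000 and context_len=8192, the estimate is
        # int(500000 / 8192 * 512) = 31250; max(..., 2048) keeps it and
        # min(..., 4096) clamps it to max_num_reqs = 4096.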
        if self.is_hybrid_gdn:
            max_num_reqs = min(max_num_reqs, self.server_args.max_mamba_cache_size)

        if self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone():
            if self.is_draft_worker:
                self.max_total_num_tokens = self.server_args.draft_runner_cache_size
                max_num_reqs = self.server_args.max_num_reqs
            else:
                # We are sharing the `token_to_kv_pool`, and both verify and draft tokens
                # can be allocated concurrently, so we should leave some headroom for them.
                self.server_args.draft_runner_cache_size = (
                    self.max_total_num_tokens
                    # draft
                    + max_num_reqs
                    * self.server_args.speculative_num_steps
                    * self.server_args.speculative_eagle_topk
                    # verify
                    + max_num_reqs * self.server_args.speculative_num_draft_tokens
                    # buffer
                    + 100
                )
                # The target worker and the draft worker share the same indices for the
                # token_to_kv_pool, so we should make sure max_total_num_tokens matches.
                self.max_total_num_tokens = self.server_args.draft_runner_cache_size
                self.server_args.max_num_reqs = max_num_reqs
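                # Worked example for draft_runner_cache_size above (hypothetical
                # numbers): with max_total_num_tokens=100000, max_num_reqs=64,
                # speculative_num_steps=4, speculative_eagle_topk=2, and
                # speculative_num_draft_tokens=8, the cache size is
                # 100000 + 64 * 4 * 2 + 64 * 8 + 100 = 101124 tokens.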

        if max_total_tokens is not None:
            if max_total_tokens > self.max_total_num_tokens:
                logger.warning(
                    f"max_total_tokens={max_total_tokens} is larger than the profiled value "
                    f"{self.max_total_num_tokens}. "
                    f"Using the profiled value instead."
                )
            self.max_total_num_tokens = min(self.max_total_num_tokens, max_total_tokens)

        self.max_total_num_tokens = (
            self.max_total_num_tokens
            // self.server_args.page_size
            * self.server_args.page_size
        )
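        # Worked example for the page alignment above (hypothetical numbers): with
        # page_size=16, a profiled budget of 1000 tokens becomes 1000 // 16 * 16 = 992,
        # i.e. the pool is rounded down to a whole number of pages.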
        # Different PP ranks may have different numbers of layers, so we reduce max_total_num_tokens to the minimum across ranks.
        if self.pp_size > 1:
            tensor = torch.tensor(self.max_total_num_tokens, dtype=torch.int64)
            torch.distributed.all_reduce(
                tensor,
                op=torch.distributed.ReduceOp.MIN,
                group=get_world_group().cpu_group,
            )
            self.max_total_num_tokens = tensor.item()

        # create token size for hybrid cache
        if self.is_hybrid:
            self.set_num_token_hybrid()

        if self.max_total_num_tokens <= 0:
            raise RuntimeError(
                f"Not enough memory. Please try to increase --mem-fraction-static. "
                f"Current value: {self.server_args.mem_fraction_static=}"
            )

        # Initialize req_to_token_pool
        if self.req_to_token_pool is None:
            # FIXME(lsyin): this is a temporary fix for the context length issue when using speculative decoding
            extra_max_context_len = 4
            if self.server_args.speculative_num_draft_tokens is not None:
                extra_max_context_len += self.server_args.speculative_num_draft_tokens

            if self.server_args.disaggregation_mode == "decode":
                from sglang.srt.disaggregation.decode import DecodeReqToTokenPool

                # Reserve memory for pre-allocated requests:
                # if max_num_reqs <= 32, we pre-allocate 2x requests.
                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
                self.req_to_token_pool = DecodeReqToTokenPool(
                    size=max_num_reqs,
                    max_context_len=self.model_config.context_len
                    + extra_max_context_len,
                    device=self.device,
                    enable_memory_saver=self.server_args.enable_memory_saver,
                    pre_alloc_size=pre_alloc_size,
                )
            elif self.is_hybrid_gdn:
                config = self.model_config.hf_config
                (
                    conv_state_shape,
                    temporal_state_shape,
                    conv_dtype,
                    ssm_dtype,
                    mamba_layers,
                ) = config.hybrid_gdn_params
                self.req_to_token_pool = HybridReqToTokenPool(
                    size=max_num_reqs,
                    max_context_len=self.model_config.context_len
                    + extra_max_context_len,
                    device=self.device,
                    enable_memory_saver=self.server_args.enable_memory_saver,
                    conv_state_shape=conv_state_shape,
                    temporal_state_shape=temporal_state_shape,
                    conv_dtype=conv_dtype,
                    ssm_dtype=ssm_dtype,
                    mamba_layers=mamba_layers,
                    speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
                )
            else:
                self.req_to_token_pool = ReqToTokenPool(
                    size=max_num_reqs,
                    max_context_len=self.model_config.context_len
                    + extra_max_context_len,
                    device=self.device,
                    enable_memory_saver=self.server_args.enable_memory_saver,
                )
        else:
            # Draft worker shares req_to_token_pool with the target worker.
            assert self.is_draft_worker

        # Initialize token_to_kv_pool
        if self.server_args.attention_backend == "ascend":
            if self.use_mla_backend:
                self.token_to_kv_pool = AscendMLAPagedTokenToKVPool(
                    self.max_total_num_tokens,
                    page_size=self.page_size,
                    dtype=self.kv_cache_dtype,
                    kv_lora_rank=self.model_config.kv_lora_rank,
                    qk_rope_head_dim=self.model_config.qk_rope_head_dim,
                    layer_num=self.num_effective_layers,
                    device=self.device,
                    enable_memory_saver=self.server_args.enable_memory_saver,
                    start_layer=self.start_layer,
                    end_layer=self.end_layer,
                )
            else:
                self.token_to_kv_pool = AscendTokenToKVPool(
                    self.max_total_num_tokens,
                    page_size=self.page_size,
                    dtype=self.kv_cache_dtype,
                    head_num=self.model_config.get_num_kv_heads(
                        get_attention_tp_size()
                    ),
                    head_dim=self.model_config.head_dim,
                    layer_num=self.model_config.num_hidden_layers,
                    device=self.device,
                    enable_memory_saver=self.server_args.enable_memory_saver,
                )
        elif self.use_mla_backend:
            self.token_to_kv_pool = MLATokenToKVPool(
                self.max_total_num_tokens,
                page_size=self.page_size,
                dtype=self.kv_cache_dtype,
                kv_lora_rank=self.model_config.kv_lora_rank,
                qk_rope_head_dim=self.model_config.qk_rope_head_dim,
                layer_num=self.num_effective_layers,
                device=self.device,
                enable_memory_saver=self.server_args.enable_memory_saver,
                start_layer=self.start_layer,
                end_layer=self.end_layer,
            )
        elif self.server_args.enable_double_sparsity:
            self.token_to_kv_pool = DoubleSparseTokenToKVPool(
                self.max_total_num_tokens,
                page_size=self.page_size,
                dtype=self.kv_cache_dtype,
                head_num=self.model_config.get_num_kv_heads(get_attention_tp_size()),
                head_dim=self.model_config.head_dim,
                layer_num=self.num_effective_layers,
                device=self.device,
                heavy_channel_num=self.server_args.ds_heavy_channel_num,
                enable_memory_saver=self.server_args.enable_memory_saver,
                start_layer=self.start_layer,
                end_layer=self.end_layer,
            )
        else:
            if self.is_hybrid:
                self.token_to_kv_pool = SWAKVPool(
                    size=self.full_max_total_num_tokens,
                    size_swa=self.swa_max_total_num_tokens,
                    dtype=self.kv_cache_dtype,
                    head_num=self.model_config.get_num_kv_heads(
                        get_attention_tp_size()
                    ),
                    head_dim=self.model_config.head_dim,
                    swa_attention_layer_ids=self.model_config.swa_attention_layer_ids,
                    full_attention_layer_ids=self.model_config.full_attention_layer_ids,
                    enable_kvcache_transpose=False,
                    device=self.device,
                )
            elif self.is_hybrid_gdn:
                self.token_to_kv_pool = HybridLinearKVPool(
                    page_size=self.page_size if _is_npu else 1,
                    size=self.max_total_num_tokens,
                    dtype=self.kv_cache_dtype,
                    head_num=self.model_config.get_num_kv_heads(
                        get_attention_tp_size()
                    ),
                    head_dim=self.model_config.head_dim,
                    # if draft worker, we only need 1 attention layer's kv pool
                    full_attention_layer_ids=(
                        [0]
                        if self.is_draft_worker
                        else self.model_config.hf_config.full_attention_layer_ids
                    ),
                    enable_kvcache_transpose=False,
                    device=self.device,
                )
            else:
                self.token_to_kv_pool = MHATokenToKVPool(
                    self.max_total_num_tokens,
                    page_size=self.page_size,
                    dtype=self.kv_cache_dtype,
                    head_num=self.model_config.get_num_kv_heads(
                        get_attention_tp_size()
                    ),
                    head_dim=self.model_config.head_dim,
                    layer_num=self.num_effective_layers,
                    device=self.device,
                    enable_memory_saver=self.server_args.enable_memory_saver,
                    start_layer=self.start_layer,
                    end_layer=self.end_layer,
                )

        # Initialize token_to_kv_pool_allocator
        need_sort = self.server_args.disaggregation_mode in ("decode", "prefill")
        if self.token_to_kv_pool_allocator is None:
            if _is_npu and (
                self.server_args.attention_backend == "ascend" or self.is_hybrid_gdn
            ):
                self.token_to_kv_pool_allocator = AscendPagedTokenToKVPoolAllocator(
                    self.max_total_num_tokens,
                    page_size=self.page_size,
                    dtype=self.kv_cache_dtype,
                    device=self.device,
                    kvcache=self.token_to_kv_pool,
                    need_sort=need_sort,
                )
            else:
                if self.page_size == 1:
                    if self.is_hybrid:
                        self.token_to_kv_pool_allocator = SWATokenToKVPoolAllocator(
                            self.full_max_total_num_tokens,
                            self.swa_max_total_num_tokens,
                            dtype=self.kv_cache_dtype,
                            device=self.device,
                            kvcache=self.token_to_kv_pool,
                            need_sort=need_sort,
                        )
                    else:
                        self.token_to_kv_pool_allocator = TokenToKVPoolAllocator(
                            self.max_total_num_tokens,
                            dtype=self.kv_cache_dtype,
                            device=self.device,
                            kvcache=self.token_to_kv_pool,
                            need_sort=need_sort,
                        )
                else:
                    assert not self.is_hybrid
                    self.token_to_kv_pool_allocator = PagedTokenToKVPoolAllocator(
                        self.max_total_num_tokens,
                        page_size=self.page_size,
                        dtype=self.kv_cache_dtype,
                        device=self.device,
                        kvcache=self.token_to_kv_pool,
                        need_sort=need_sort,
                    )
        else:
            assert self.is_draft_worker

        logger.info(
            f"Memory pool end. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

    def init_cublas(self):
        """We need to run a small matmul to init cublas. Otherwise, it will raise some errors later."""
        dtype = torch.float16
        device = "cuda"
        a = torch.ones((16, 16), dtype=dtype, device=device)
        b = torch.ones((16, 16), dtype=dtype, device=device)
        c = a @ b
        return c

    def init_attention_backend(self):
        """Init attention kernel backend."""
        if self.server_args.enable_two_batch_overlap and not self.is_draft_worker:
            self.attn_backend = TboAttnBackend.init_new(self._get_attention_backend)
        else:
            self.attn_backend = self._get_attention_backend()

    def _get_attention_backend(self):
        """Init attention kernel backend."""
        self.decode_attention_backend_str = (
            self.server_args.decode_attention_backend
            if self.server_args.decode_attention_backend
            else self.server_args.attention_backend
        )
        self.prefill_attention_backend_str = (
            self.server_args.prefill_attention_backend
            if self.server_args.prefill_attention_backend
            else self.server_args.attention_backend
        )
        if self.decode_attention_backend_str != self.prefill_attention_backend_str:
            from sglang.srt.layers.attention.hybrid_attn_backend import (
                HybridAttnBackend,
            )

            attn_backend = HybridAttnBackend(
                self,
                decode_backend=self._get_attention_backend_from_str(
                    self.decode_attention_backend_str
                ),
                prefill_backend=self._get_attention_backend_from_str(
                    self.prefill_attention_backend_str
                ),
            )
            logger.info(
                f"Using hybrid attention backend for decode and prefill: "
                f"decode_backend={self.decode_attention_backend_str}, "
                f"prefill_backend={self.prefill_attention_backend_str}."
            )
            logger.warning(
                f"Warning: Attention backend specified by --attention-backend or default backend might be overridden."
                f"The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
            )
        else:
            attn_backend = self._get_attention_backend_from_str(
                self.server_args.attention_backend
            )

        global_server_args_dict.update(
            {
                "decode_attention_backend": self.decode_attention_backend_str,
                "prefill_attention_backend": self.prefill_attention_backend_str,
            }
        )
        return attn_backend
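    # Usage sketch (assuming the usual mapping of these server args to CLI flags and
    # that "flashinfer" / "triton" are keys registered in ATTENTION_BACKENDS):
    # launching with
    #   --prefill-attention-backend flashinfer --decode-attention-backend triton
    # makes the two backend strings above differ, so a HybridAttnBackend wrapping one
    # backend per phase is constructed instead of a single backend.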

    def _get_attention_backend_from_str(self, backend_str: str):
        if backend_str not in ATTENTION_BACKENDS:
            raise ValueError(f"Invalid attention backend: {backend_str}")
        full_attention_backend = ATTENTION_BACKENDS[backend_str](self)
        return attn_backend_wrapper(self, full_attention_backend)

    def init_double_sparsity_channel_config(self, selected_channel):
        selected_channel = "." + selected_channel + "_proj"
        self.sorted_channels = []
        # load channel config
        with open(self.server_args.ds_channel_config_path, "r") as f:
            channel_config = json.load(f)

        for i in range(self.start_layer, self.end_layer):
            key = "model.layers." + str(i) + ".self_attn" + selected_channel
            self.sorted_channels.append(
                torch.tensor(channel_config[key])[
                    :, : self.server_args.ds_heavy_channel_num
                ]
                .contiguous()
                .cuda()
            )
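            # A minimal sketch of the expected ds_channel_config entry consumed here
            # (an assumption inferred from the lookup above, not a documented schema):
            # each "model.layers.<i>.self_attn.<x>_proj" key maps to a 2-D list of
            # channel indices, of which only the first ds_heavy_channel_num columns
            # per row are kept, e.g.
            #   {"model.layers.0.self_attn.q_proj": [[27, 3, 91], [14, 8, 66]]}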

    def init_device_graphs(self):
        """Capture device graphs."""
        self.graph_runner = None
        self.graph_mem_usage = 0

        if not self.is_generation:
            # TODO: Currently, cuda graph only captures decode steps, which only exist for generation models
            return

        if self.device != "cpu" and self.server_args.disable_cuda_graph:
            return

        if self.device == "cpu" and not self.server_args.enable_torch_compile:
            return

        tic = time.perf_counter()
        before_mem = get_available_gpu_memory(self.device, self.gpu_id)
        logger.info(
            f"Capture {'cpu graph' if self.device == 'cpu' else 'cuda graph'} begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
        )
        graph_runners = defaultdict(
            lambda: CudaGraphRunner,
            {
                "cpu": CPUGraphRunner,
                "npu": NPUGraphRunner,
            },
        )
        self.graph_runner = graph_runners[self.device](self)

        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
        self.graph_mem_usage = before_mem - after_mem
        logger.info(
            f"Capture {'cpu graph' if self.device == 'cpu' else 'cuda graph'} end. Time elapsed: {time.perf_counter() - tic:.2f} s. "
            f"mem usage={self.graph_mem_usage:.2f} GB. avail mem={after_mem:.2f} GB."
        )

    def init_threads_binding(self):
        omp_cpuids = os.environ.get("SGLANG_CPU_OMP_THREADS_BIND", "all")
        cpu_ids_by_node = get_cpu_ids_by_node()
        n_numa_node = len(cpu_ids_by_node)
        if omp_cpuids == "all":
            assert self.tp_size <= n_numa_node, (
                f"SGLANG_CPU_OMP_THREADS_BIND is not set. In this case, "
                f"tp_size {self.tp_size} should be smaller than or equal to the number of NUMA nodes on the machine ({n_numa_node}). "
                f"If you need tp_size to be larger than the number of NUMA nodes, please set the CPU cores for each tp rank via SGLANG_CPU_OMP_THREADS_BIND explicitly. "
                f"For example, on a machine with 2 NUMA nodes, where cores 0-31 are on NUMA node 0 and cores 32-63 are on NUMA node 1, "
                f"it is suggested to use -tp 2 and bind tp rank 0 to cores 0-31 and tp rank 1 to cores 32-63. "
                f"This is the default behavior if SGLANG_CPU_OMP_THREADS_BIND is not set and it is the same as setting SGLANG_CPU_OMP_THREADS_BIND=0-31|32-63. "
                f"If you do need tp_size to be larger than the number of NUMA nodes, you could set SGLANG_CPU_OMP_THREADS_BIND explicitly, for example SGLANG_CPU_OMP_THREADS_BIND=0-15|16-31|32-47|48-63, and run with -tp 4. "
                f"If you don't want each tp rank to use all the cores on one NUMA node, you could set, for example, SGLANG_CPU_OMP_THREADS_BIND=0-15|32-47 and run with -tp 2."
            )
            if self.tp_size < n_numa_node:
                logger.warning(
                    f"Detected that the current machine has {n_numa_node} NUMA nodes available, but tp_size is set to {self.tp_size}, so only {self.tp_size} NUMA nodes are used."
                )
            self.local_omp_cpuid = cpu_ids_by_node[self.tp_rank]
        else:
            threads_bind_list = omp_cpuids.split("|")
            assert self.tp_size == len(threads_bind_list), (
                f"SGLANG_CPU_OMP_THREADS_BIND setting must be aligned with TP size parameter ({self.tp_size}). "
                f"Please double check your settings."
            )
            self.local_omp_cpuid = threads_bind_list[self.tp_rank]
            if self.tp_size > n_numa_node:
                logger.warning(
                    f"TP size ({self.tp_size}) is larger than the number of NUMA nodes ({n_numa_node}); "
                    f"in this case the amount of memory available to each rank cannot be determined in advance. "
                    f"Please set a proper `--max-total-tokens` to avoid out-of-memory errors."
                )

    def apply_torch_tp(self):
        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
        from sglang.srt.layers.model_parallel import tensor_parallel

        device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
        tensor_parallel(self.model, device_mesh)

    def forward_decode(
        self,
        forward_batch: ForwardBatch,
        skip_attn_backend_init: bool = False,
        pp_proxy_tensors=None,
    ) -> LogitsProcessorOutput:
        if not skip_attn_backend_init:
            self.attn_backend.init_forward_metadata(forward_batch)
        # FIXME: add pp_proxy_tensors arg to all models
        kwargs = {}
        if self.support_pp:
            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
        return self.model.forward(
            forward_batch.input_ids,
            forward_batch.positions,
            forward_batch,
            **kwargs,
        )

    def forward_extend(
        self,
        forward_batch: ForwardBatch,
        skip_attn_backend_init: bool = False,
        pp_proxy_tensors=None,
    ) -> LogitsProcessorOutput:
        if not skip_attn_backend_init:
            self.attn_backend.init_forward_metadata(forward_batch)

        kwargs = {}
        if self.support_pp:
            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
        if forward_batch.input_embeds is not None:
            kwargs["input_embeds"] = forward_batch.input_embeds.bfloat16()
        if not self.is_generation:
            kwargs["get_embedding"] = True
        return self.model.forward(
            forward_batch.input_ids,
            forward_batch.positions,
            forward_batch,
            **kwargs,
        )

    def forward_idle(
        self, forward_batch: ForwardBatch, pp_proxy_tensors=None
    ) -> LogitsProcessorOutput:
        kwargs = {}
        if self.support_pp:
            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
        return self.model.forward(
            forward_batch.input_ids,
            forward_batch.positions,
            forward_batch,
            **kwargs,
        )

    def forward_split_prefill(
        self,
        forward_batch: ForwardBatch,
        reinit_attn_backend: bool = False,
        forward_count: int = 1,
    ) -> LogitsProcessorOutput:
        if forward_batch.split_index == 0 or reinit_attn_backend:
            self.attn_backend.init_forward_metadata(forward_batch)
        next_split_index = min(
            forward_batch.split_index + forward_count,
            self.model_config.num_hidden_layers,
        )
        ret = self.model.forward_split_prefill(
            forward_batch.input_ids,
            forward_batch.positions,
            forward_batch,
            (forward_batch.split_index, next_split_index),
        )
        forward_batch.split_index = next_split_index
        return ret
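    # A minimal sketch of how a caller might drive forward_split_prefill above (an
    # illustration under assumed usage, not an API defined in this file): advance
    # split_index in chunks of `forward_count` layers until every hidden layer has run.
    #
    #   forward_batch.split_index = 0
    #   ret = None
    #   while forward_batch.split_index < runner.model_config.num_hidden_layers:
    #       ret = runner.forward_split_prefill(forward_batch, forward_count=4)
    #   # `ret` from the final chunk is expected to hold the LogitsProcessorOutput.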

    def forward(
        self,
        forward_batch: ForwardBatch,
        skip_attn_backend_init: bool = False,
        pp_proxy_tensors: Optional[PPProxyTensors] = None,
        reinit_attn_backend: bool = False,
        split_forward_count: int = 1,
    ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
        self.forward_pass_id += 1

        with get_global_expert_distribution_recorder().with_forward_pass(
            self.forward_pass_id,
            forward_batch,
        ):
            output = self._forward_raw(
                forward_batch,
                skip_attn_backend_init,
                pp_proxy_tensors,
                reinit_attn_backend,
                split_forward_count,
            )

        if self.eplb_manager is not None:
            self.eplb_manager.on_forward_pass_end()

        return output

    def _forward_raw(
        self,
        forward_batch: ForwardBatch,
        skip_attn_backend_init: bool,
        pp_proxy_tensors: Optional[PPProxyTensors],
        reinit_attn_backend: bool = False,
        split_forward_count: int = 1,
    ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
        mode_check = (
            forward_batch.forward_mode.is_cpu_graph
            if self.device == "cpu"
            else forward_batch.forward_mode.is_cuda_graph
        )
        can_run_graph = bool(
            mode_check()
            and self.graph_runner
            and self.graph_runner.can_run(forward_batch)
        )

        if can_run_graph:
            ret = self.graph_runner.replay(
                forward_batch,
                skip_attn_backend_init=skip_attn_backend_init,
                pp_proxy_tensors=pp_proxy_tensors,
            )
            return ret, can_run_graph

        # For MLP sync
        if forward_batch.global_num_tokens_cpu is not None:
            forward_batch.prepare_mlp_sync_batch(self)

        if forward_batch.forward_mode.is_decode():
            ret = self.forward_decode(
                forward_batch,
                skip_attn_backend_init=skip_attn_backend_init,
                pp_proxy_tensors=pp_proxy_tensors,
            )
        elif forward_batch.forward_mode.is_extend():
            ret = self.forward_extend(
                forward_batch,
                skip_attn_backend_init=skip_attn_backend_init,
                pp_proxy_tensors=pp_proxy_tensors,
            )
        elif forward_batch.forward_mode.is_split_prefill():
            ret = self.forward_split_prefill(
                forward_batch,
                reinit_attn_backend=reinit_attn_backend,
                forward_count=split_forward_count,
            )
        elif forward_batch.forward_mode.is_idle():
            ret = self.forward_idle(forward_batch, pp_proxy_tensors=pp_proxy_tensors)
        else:
            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")

        if (
            forward_batch.global_num_tokens_cpu is not None
            and self.pp_group.is_last_rank
        ):
            forward_batch.post_forward_mlp_sync_batch(ret)

        return ret, can_run_graph

    def _preprocess_logits(
        self, logits_output: LogitsProcessorOutput, sampling_info: SamplingBatchInfo
    ):
        # Apply logit bias
        if sampling_info.sampling_info_done:
            # Overlap mode: the function update_regex_vocab_mask was executed
            # in process_batch_result of the last batch.
            if sampling_info.grammars:
                sampling_info.sampling_info_done.wait()
        else:
            # Normal mode: Put CPU-heavy tasks here. They will be overlapped with the forward pass.
            sampling_info.update_regex_vocab_mask()
        sampling_info.apply_logits_bias(logits_output.next_token_logits)

    def sample(
        self,
        logits_output: LogitsProcessorOutput,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        """Sample and compute logprobs and update logits_output.

        Args:
            logits_output: The logits output from the model forward
            forward_batch: The forward batch that generates logits_output

        Returns:
            A tensor of next token ids.
        """
        # For duplex models with multiple output streams.
        if isinstance(logits_output, tuple):
            return torch.stack(
                [self.sample(values, forward_batch) for values in logits_output],
                axis=-1,
            )

        self._preprocess_logits(logits_output, forward_batch.sampling_info)

        # Sample the next tokens
        next_token_ids = self.sampler(
            logits_output,
            forward_batch.sampling_info,
            forward_batch.return_logprob,
            forward_batch.top_logprobs_nums,
            forward_batch.token_ids_logprobs,
            # For prefill, we only use the position of the last token.
            (
                forward_batch.positions
                if forward_batch.forward_mode.is_decode()
                else forward_batch.seq_lens - 1
            ),
        )
        return next_token_ids

    def compute_logprobs_only(
        self,
        logits_output: LogitsProcessorOutput,
        forward_batch: ForwardBatch,
    ) -> None:
        """
        Compute token_ids_logprobs without performing sampling.

        Optimized path for prefill-only requests that need token_ids_logprobs but don't
        require next token generation. Skips expensive sampling operations
        while still providing requested probability information.

        Args:
            logits_output: The logits output from the model forward
            forward_batch: The forward batch that generates logits_output
        """
        if not forward_batch.token_ids_logprobs:
            return

        # Preprocess logits (same as in sample method)
        self._preprocess_logits(logits_output, forward_batch.sampling_info)

        # Delegate to sampler for logprob-only computation
        # This populates logits_output with requested token probabilities
        self.sampler.compute_logprobs_only(
            logits_output,
            forward_batch.sampling_info,
            forward_batch.return_logprob,
            forward_batch.top_logprobs_nums,
            forward_batch.token_ids_logprobs,
        )

    @property
    def model_is_mrope(self) -> bool:
        """Detect if the model has "mrope" rope_scaling type.
        mrope requires keep "rope_deltas" between prompt and decoding phases."""
        rope_scaling = getattr(self.model_config.hf_text_config, "rope_scaling", {})
        if rope_scaling is None:
            return False
        is_mrope_enabled = "mrope_section" in rope_scaling
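        # Example rope_scaling dict that would take the mrope path (hypothetical
        # values in the style of mrope-enabled configs such as Qwen2-VL):
        #   {"type": "mrope", "mrope_section": [16, 24, 24]}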
        return is_mrope_enabled

    def save_remote_model(self, url: str):
        from sglang.srt.model_loader.loader import RemoteModelLoader

        logger.info(f"Saving model to {url}")
        RemoteModelLoader.save_model(self.model, self.model_config.model_path, url)

    def save_sharded_model(
        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
    ):
        from sglang.srt.model_loader.loader import ShardedStateLoader

        logger.info(
            f"Save sharded model to {path} with pattern {pattern} and max_size {max_size}"
        )
        ShardedStateLoader.save_model(self.model, path, pattern, max_size)


def _model_load_weights_direct(model, named_tensors: List[Tuple[str, torch.Tensor]]):
    params_dict = dict(model.named_parameters())
    for name, tensor in named_tensors:
        default_weight_loader(params_dict[name], tensor)


def _unwrap_tensor(tensor, tp_rank, device):
    if isinstance(tensor, LocalSerializedTensor):
        tensor = tensor.get(tp_rank)
    return tensor.to(device)


@dataclass
class LocalSerializedTensor:
    """torch.Tensor that gets serialized by MultiprocessingSerializer (which only serializes a pointer and not the data).
    The i-th element in the list corresponds to the i-th rank's GPU."""

    values: List[bytes]

    def get(self, rank: int):
        return MultiprocessingSerializer.deserialize(self.values[rank])
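
# A minimal sketch of how LocalSerializedTensor pairs with _unwrap_tensor above (an
# illustration; it assumes the sender calls MultiprocessingSerializer.serialize once
# per TP rank before shipping the handle to the model runner process):
#
#   packed = LocalSerializedTensor(
#       values=[MultiprocessingSerializer.serialize(t) for t in per_rank_tensors]
#   )
#   # On TP rank r, _unwrap_tensor(packed, tp_rank=r, device=dev) deserializes
#   # values[r] and moves the resulting tensor to `dev`.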