decode.py 23.1 KB
Newer Older
Byron Hsu's avatar
Byron Hsu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
Life cycle of a request in the decode server

1. PreallocQueue:
    a. Initialize a receiver for each request
    b. The request handshakes first; its KV cache is then pre-allocated once enough KV slots are available.
    c. Move the request to TransferQueue.

2. TransferQueue:
    a. Poll the receiver to check the transfer state
    b. If the transfer has finished, move the request to waiting queue

3. WaitingQueue:
    a. Use the requests in the queue to construct a PrebuiltExtendBatch
    b. Skip the prefill forward but only populate metadata

4. RunningBatch:
    a. Merge the resolved PrebuiltExtendBatch into running batch to run decoding
"""

from __future__ import annotations

import logging
Liangsheng Yin's avatar
Liangsheng Yin committed
24
import os
25
from collections import deque
Byron Hsu's avatar
Byron Hsu committed
26
from dataclasses import dataclass
27
from http import HTTPStatus
Byron Hsu's avatar
Byron Hsu committed
28
29
from typing import TYPE_CHECKING, List, Optional, Tuple

30
import numpy as np
Byron Hsu's avatar
Byron Hsu committed
31
32
33
import torch
from torch.distributed import ProcessGroup

34
from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVArgs, KVPoll
Byron Hsu's avatar
Byron Hsu committed
35
from sglang.srt.disaggregation.utils import (
36
    DisaggregationMode,
37
    FakeBootstrapHost,
38
    KVClassType,
Byron Hsu's avatar
Byron Hsu committed
39
    ReqToMetadataIdxAllocator,
40
41
    TransferBackend,
    get_kv_class,
42
    is_mla_backend,
Byron Hsu's avatar
Byron Hsu committed
43
    kv_to_page_indices,
Byron Hsu's avatar
Byron Hsu committed
44
    poll_and_all_reduce,
45
    prepare_abort,
Byron Hsu's avatar
Byron Hsu committed
46
)
Byron Hsu's avatar
Byron Hsu committed
47
from sglang.srt.managers.schedule_batch import FINISH_ABORT
Byron Hsu's avatar
Byron Hsu committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from sglang.srt.configs.model_config import ModelConfig
    from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
    from sglang.srt.managers.scheduler import Scheduler
    from sglang.srt.server_args import ServerArgs


@dataclass
class DecodeRequest:
    """Pair a request with the KV receiver that fetches its KV cache."""

    req: Req
    kv_receiver: BaseKVReceiver
    # Set to True once polling the receiver reports KVPoll.WaitingForInput.
    waiting_for_input: bool = False
    # Slot obtained from the metadata-buffer index allocator; -1 means no slot
    # has been assigned yet.
    metadata_buffer_index: int = -1


class DecodePreallocQueue:
    """
    Store the requests that are preallocating.
    """

    def __init__(
        self,
        req_to_token_pool: ReqToTokenPool,
        token_to_kv_pool_allocator: TokenToKVPoolAllocator,
        req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
        metadata_buffers: List[torch.Tensor],
        aux_dtype: torch.dtype,
        scheduler: Scheduler,
        transfer_queue: DecodeTransferQueue,
        tree_cache: BasePrefixCache,
        gloo_group: ProcessGroup,
        tp_rank: int,
        tp_size: int,
        bootstrap_port: int,
        transfer_backend: TransferBackend,
    ):
        """Store the pools, allocators and transfer settings, then build the KV manager.

        The number of KV slots reserved per request for decoding is read from
        the ``SGLANG_NUM_RESERVED_DECODE_TOKENS`` environment variable
        (default ``512``). The KV manager is created last because
        ``_init_kv_manager`` reads the attributes assigned here.
        """
        # Memory pools and the KV cache that backs the allocator.
        self.req_to_token_pool = req_to_token_pool
        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
        kv_cache = token_to_kv_pool_allocator.get_kvcache()
        self.token_to_kv_pool = kv_cache
        self.is_mla_backend = is_mla_backend(kv_cache)

        # Auxiliary (metadata) buffers and their index allocator.
        self.aux_dtype = aux_dtype
        self.metadata_buffers = metadata_buffers
        self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator

        # Scheduler-side collaborators.
        self.scheduler = scheduler
        self.transfer_queue = transfer_queue
        self.tree_cache = tree_cache  # this is always a chunk cache

        # Distributed / bootstrap configuration.
        self.gloo_group = gloo_group
        self.tp_rank = tp_rank
        self.tp_size = tp_size
        self.bootstrap_port = bootstrap_port

        reserved = os.environ.get("SGLANG_NUM_RESERVED_DECODE_TOKENS", "512")
        self.num_reserved_decode_tokens = int(reserved)

        # Queue for requests pending pre-allocation
        self.queue: List[DecodeRequest] = []
        self.transfer_backend = transfer_backend
        self.kv_manager = self._init_kv_manager()

115
    def _init_kv_manager(self) -> BaseKVManager:
        """Describe the local KV and metadata buffers and build the decode-mode KV manager.

        Returns:
            The manager instance produced by the class that ``get_kv_class``
            selects for ``self.transfer_backend``.
        """
        buffers = self.metadata_buffers
        server_args = self.scheduler.server_args

        kv_args = KVArgs()
        kv_args.engine_rank = self.tp_rank

        # Pointers / total lengths / per-item lengths of the KV cache buffers.
        (
            kv_args.kv_data_ptrs,
            kv_args.kv_data_lens,
            kv_args.kv_item_lens,
        ) = self.token_to_kv_pool.get_contiguous_buf_infos()

        # The same three descriptors for the auxiliary metadata buffers.
        kv_args.aux_data_ptrs = [buf.data_ptr() for buf in buffers]
        kv_args.aux_data_lens = [buf.nbytes for buf in buffers]
        kv_args.aux_item_lens = [buf[0].nbytes for buf in buffers]

        kv_args.ib_device = server_args.disaggregation_ib_device
        kv_args.gpu_id = self.scheduler.gpu_id

        manager_cls = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
        return manager_cls(
            kv_args,
            DisaggregationMode.DECODE,
            server_args,
            self.is_mla_backend,
        )

    def add(self, req: Req) -> None:
        """Create a KV receiver for ``req`` and append the pair to the pending queue."""
        # Warmup requests carry the fake bootstrap host and use the fake
        # transfer backend instead of the configured one.
        backend = (
            TransferBackend.FAKE
            if req.bootstrap_host == FakeBootstrapHost
            else self.transfer_backend
        )
        receiver_cls = get_kv_class(backend, KVClassType.RECEIVER)
        receiver = receiver_cls(
            mgr=self.kv_manager,
            bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
            bootstrap_room=req.bootstrap_room,
        )
        self.queue.append(DecodeRequest(req=req, kv_receiver=receiver))

    def extend(self, reqs: List[Req]) -> None:
        """Add a request to the pending queue."""
        for req in reqs:
            self.add(req)

    def _update_handshake_waiters(self) -> None:
        if not self.queue:
            return

        if all(decode_req.waiting_for_input for decode_req in self.queue):
            return

        polls = poll_and_all_reduce(
            [decode_req.kv_receiver for decode_req in self.queue], self.gloo_group
        )

        for i, (decode_req, poll) in enumerate(zip(self.queue, polls)):
            if poll == KVPoll.Bootstrapping:
                pass
            elif poll == KVPoll.WaitingForInput:
                decode_req.waiting_for_input = True
            elif poll == KVPoll.Failed:
184
185
186
187
188
189
190
191
192
193
194
                error_message = f"Decode handshake failed for request rank={self.tp_rank} {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
                try:
                    decode_req.kv_receiver.failure_exception()
                except Exception as e:
                    error_message += f" with exception {e}"
                logger.error(error_message)
                prepare_abort(
                    decode_req.req,
                    error_message,
                    status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
                )
Byron Hsu's avatar
Byron Hsu committed
195
196
197
198
199
200
201

    def pop_preallocated(self) -> List[DecodeRequest]:
        """Pop the preallocated requests from the pending queue (FIFO)."""
        self._update_handshake_waiters()

        preallocated_reqs = []
        indices_to_remove = set()
202
        allocatable_tokens = self._allocatable_tokens()
Byron Hsu's avatar
Byron Hsu committed
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229

        for i, decode_req in enumerate(self.queue):
            if not decode_req.waiting_for_input:
                continue

            if self.req_to_token_pool.available_size() <= 0:
                break

            if self.req_to_metadata_buffer_idx_allocator.available_size() <= 0:
                break

            required_tokens_for_request = (
                len(decode_req.req.origin_input_ids) + self.num_reserved_decode_tokens
            )

            if required_tokens_for_request > allocatable_tokens:
                break

            allocatable_tokens -= required_tokens_for_request
            self._pre_alloc(decode_req.req)

            kv_indices = (
                self.req_to_token_pool.req_to_token[decode_req.req.req_pool_idx][
                    : len(decode_req.req.origin_input_ids)
                ]
                .cpu()
                .numpy()
230
                .astype(np.int64)
Byron Hsu's avatar
Byron Hsu committed
231
232
233
234
235
236
            )

            decode_req.metadata_buffer_index = (
                self.req_to_metadata_buffer_idx_allocator.alloc()
            )
            assert decode_req.metadata_buffer_index is not None
Byron Hsu's avatar
Byron Hsu committed
237
238
239
240
            page_indices = kv_to_page_indices(
                kv_indices, self.token_to_kv_pool_allocator.page_size
            )
            decode_req.kv_receiver.init(page_indices, decode_req.metadata_buffer_index)
Byron Hsu's avatar
Byron Hsu committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
            preallocated_reqs.append(decode_req)
            indices_to_remove.add(i)

        self.queue = [
            entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
        ]

        return preallocated_reqs

    def _allocatable_tokens(self) -> int:
        allocatable_tokens = (
            self.token_to_kv_pool_allocator.available_size()
            - self.num_reserved_decode_tokens
            * (
                len(self.scheduler.running_batch.reqs)
                + len(self.transfer_queue.queue)
                + len(self.scheduler.waiting_queue)
            )
        )

        # Note: if the last fake extend just finishes, and we enter `pop_preallocated` immediately in the next iteration
        #       the extend batch is not in any queue, so we need to explicitly add the tokens slots here
        if (
            self.scheduler.last_batch
            and self.scheduler.last_batch.forward_mode.is_extend()
        ):
            allocatable_tokens -= self.num_reserved_decode_tokens * len(
                self.scheduler.last_batch.reqs
            )

        return allocatable_tokens

    def _pre_alloc(self, req: Req) -> torch.Tensor:
        """Pre-allocate the memory for req_to_token and token_kv_pool"""
        req_pool_indices = self.req_to_token_pool.alloc(1)

        assert req_pool_indices is not None

        req.req_pool_idx = req_pool_indices[0]
Byron Hsu's avatar
Byron Hsu committed
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
        if self.token_to_kv_pool_allocator.page_size == 1:
            kv_loc = self.token_to_kv_pool_allocator.alloc(
                len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
            )
        else:
            num_tokens = len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
            kv_loc = self.token_to_kv_pool_allocator.alloc_extend(
                prefix_lens=torch.tensor(
                    [0],
                    dtype=torch.int64,
                    device=self.token_to_kv_pool_allocator.device,
                ),
                seq_lens=torch.tensor(
                    [num_tokens],
                    dtype=torch.int64,
                    device=self.token_to_kv_pool_allocator.device,
                ),
                last_loc=torch.tensor(
                    [-1],
                    dtype=torch.int64,
                    device=self.token_to_kv_pool_allocator.device,
                ),
                extend_num_tokens=num_tokens,
            )
Byron Hsu's avatar
Byron Hsu committed
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
        assert kv_loc is not None

        self.req_to_token_pool.write((req.req_pool_idx, slice(0, len(kv_loc))), kv_loc)

        # populate metadata
        req.fill_ids = req.origin_input_ids + req.output_ids
        req.extend_input_len = len(req.origin_input_ids)

        return kv_loc


class DecodeTransferQueue:
    """
    Store the requests that are polling for their KV transfer to finish.
    """

    def __init__(
        self,
        gloo_group: ProcessGroup,
        req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
        metadata_buffers: torch.Tensor,
Byron Hsu's avatar
Byron Hsu committed
325
326
        scheduler: Scheduler,
        tree_cache: BasePrefixCache,
Byron Hsu's avatar
Byron Hsu committed
327
328
329
330
331
    ):
        self.queue: List[DecodeRequest] = []
        self.gloo_group = gloo_group
        self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
        self.metadata_buffers = metadata_buffers
Byron Hsu's avatar
Byron Hsu committed
332
333
        self.scheduler = scheduler
        self.tree_cache = tree_cache
Byron Hsu's avatar
Byron Hsu committed
334
335
336
337
338
339
340

    def add(self, req_conn: DecodeRequest) -> None:
        """Append one pre-allocated request to the transfer queue."""
        self.queue.append(req_conn)

    def extend(self, req_conns) -> None:
        """Append every request in ``req_conns`` to the transfer queue, in order."""
        self.queue.extend(req_conns)

Liangsheng Yin's avatar
Liangsheng Yin committed
341
    def pop_transferred(self) -> List[DecodeRequest]:
Byron Hsu's avatar
Byron Hsu committed
342
343
344
345
346
347
348
        if not self.queue:
            return []

        polls = poll_and_all_reduce(
            [decode_req.kv_receiver for decode_req in self.queue], self.gloo_group
        )

Byron Hsu's avatar
Byron Hsu committed
349
350
351
352
353
354
355
356
        # First, remove all failed requests from the queue
        for i, decode_req in enumerate(self.queue):
            if isinstance(decode_req.req.finished_reason, FINISH_ABORT):
                self.scheduler.stream_output(
                    [decode_req.req], decode_req.req.return_logprob
                )
                indices_to_remove.add(i)

Byron Hsu's avatar
Byron Hsu committed
357
358
359
360
        transferred_reqs = []
        indices_to_remove = set()
        for i, (decode_req, poll) in enumerate(zip(self.queue, polls)):
            if poll == KVPoll.Failed:
361
                error_message = f"Decode transfer failed for request {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
                try:
                    decode_req.kv_receiver.failure_exception()
                except Exception as e:
                    error_message += f" with exception {e}"
                logger.error(error_message)
                prepare_abort(
                    decode_req.req,
                    error_message,
                    status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
                )
                self.scheduler.stream_output(
                    [decode_req.req], decode_req.req.return_logprob
                )
                # unlock the kv cache or it will have memory leak
                self.tree_cache.cache_finished_req(decode_req.req)
                indices_to_remove.add(i)
                continue
Byron Hsu's avatar
Byron Hsu committed
379
380
381
382
383
384
385
386
387
388
            elif poll == KVPoll.Success:
                # pop and push it to waiting queue
                idx = decode_req.metadata_buffer_index
                assert len(decode_req.req.output_ids) == 0
                output_id_buffer = self.metadata_buffers[0]
                # the last dimension is padded by the same values.
                output_id = output_id_buffer[idx][0].item()
                assert len(decode_req.req.output_ids) == 0
                assert decode_req.req.transferred_output_id is None
                decode_req.req.transferred_output_id = output_id
Liangsheng Yin's avatar
Liangsheng Yin committed
389
                transferred_reqs.append(decode_req)
Byron Hsu's avatar
Byron Hsu committed
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
                indices_to_remove.add(i)
            elif poll in [
                KVPoll.Bootstrapping,
                KVPoll.WaitingForInput,
                KVPoll.Transferring,
            ]:
                pass
            else:
                raise ValueError(f"Unexpected poll case: {poll}")

        for i in indices_to_remove:
            idx = self.queue[i].metadata_buffer_index
            assert idx != -1
            self.req_to_metadata_buffer_idx_allocator.free(idx)

        self.queue = [
            entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
        ]

        return transferred_reqs


class SchedulerDisaggregationDecodeMixin:

414
415
416
417
418
419
420
421
422
    def _prepare_idle_batch_and_run(self, batch, delay_process=False):
        batch, _ = self.prepare_dp_attn_batch(batch)
        result = None
        if batch:
            result = self.run_batch(batch)
            if not delay_process:
                self.process_batch_result(batch, result)
        return batch, result

423
    @torch.no_grad()
    def event_loop_normal_disagg_decode(self: Scheduler):
        """A normal scheduler loop for decode worker in disaggregation mode.

        Each iteration receives and processes new requests, advances the
        prealloc/transfer queues, picks the next batch, runs it and processes
        its result immediately. The loop never returns.
        """

        while True:
            recv_reqs = self.recv_requests()
            self.process_input_requests(recv_reqs)
            # polling and allocating kv cache
            self.process_decode_queue()
            batch = self.get_next_disagg_decode_batch_to_run()
            self.cur_batch = batch

            # Whether the batch must go through `prepare_dp_attn_batch` first.
            prepare_dp_attn_flag = (
                self.server_args.enable_dp_attention
                or self.server_args.enable_sp_layernorm
            )

            if batch:
                # Generate fake extend output.
                if batch.forward_mode.is_extend():
                    # Note: Logprobs should be handled on the prefill engine.
                    self.stream_output(batch.reqs, False)
                    if prepare_dp_attn_flag:
                        # The extend batch itself is not run; a prepared
                        # `None` batch is run instead.
                        self._prepare_idle_batch_and_run(None)
                else:
                    if prepare_dp_attn_flag:
                        # NOTE(review): the return value is discarded here —
                        # presumably `batch` is updated in place; confirm.
                        self.prepare_dp_attn_batch(batch)
                    result = self.run_batch(batch)
                    self.process_batch_result(batch, result)
            elif prepare_dp_attn_flag:
                # No local work: still run whatever the preparation returns.
                batch, _ = self._prepare_idle_batch_and_run(None)

            if batch is None and (
                len(self.disagg_decode_transfer_queue.queue)
                + len(self.disagg_decode_prealloc_queue.queue)
                == 0
            ):
                # When the server is idle, do self-check and re-init some states
                self.check_memory()
                self.new_token_ratio = self.init_new_token_ratio

            # Read by the next iteration's batch selection.
            self.last_batch = batch

466
    @torch.no_grad()
    def event_loop_overlap_disagg_decode(self: Scheduler):
        """Scheduler loop for the decode worker that defers result processing.

        Results of batches that were run are pushed onto ``result_queue`` and
        processed one iteration later, after the next batch has been
        launched. The loop never returns.
        """
        # FIFO of (batch copy, result) pairs waiting to be processed.
        result_queue = deque()
        self.last_batch: Optional[ScheduleBatch] = None
        self.last_batch_in_queue = False  # last batch is modified in-place, so we need another variable to track if it's extend

        while True:
            recv_reqs = self.recv_requests()
            self.process_input_requests(recv_reqs)
            # polling and allocating kv cache
            self.process_decode_queue()
            batch = self.get_next_disagg_decode_batch_to_run()
            self.cur_batch = batch
            # Becomes True only if this iteration pushes onto `result_queue`.
            last_batch_in_queue = False

            # Whether the batch must go through `prepare_dp_attn_batch` first.
            prepare_dp_attn_flag = (
                self.server_args.enable_dp_attention
                or self.server_args.enable_sp_layernorm
            )

            if batch:
                # Generate fake extend output.
                if batch.forward_mode.is_extend():
                    # Note: Logprobs should be handled on the prefill engine.
                    self.stream_output(batch.reqs, False)
                    if prepare_dp_attn_flag:
                        # The extend batch itself is not run; a prepared
                        # `None` batch is run instead and its result deferred.
                        batch_, result = self._prepare_idle_batch_and_run(
                            None, delay_process=True
                        )
                        if batch_:
                            result_queue.append((batch_.copy(), result))
                            last_batch_in_queue = True
                else:
                    if prepare_dp_attn_flag:
                        # NOTE(review): the return value is discarded here —
                        # presumably `batch` is updated in place; confirm.
                        self.prepare_dp_attn_batch(batch)
                    result = self.run_batch(batch)
                    # Queue a copy of the batch together with its result.
                    result_queue.append((batch.copy(), result))
                    last_batch_in_queue = True
            elif prepare_dp_attn_flag:
                batch, result = self._prepare_idle_batch_and_run(
                    None, delay_process=True
                )
                if batch:
                    result_queue.append((batch.copy(), result))
                    last_batch_in_queue = True

            # Process the results of the previous batch but skip if the last batch is extend
            if self.last_batch and self.last_batch_in_queue:
                tmp_batch, tmp_result = result_queue.popleft()
                self.process_batch_result(tmp_batch, tmp_result)

            if batch is None and (
                len(self.disagg_decode_transfer_queue.queue)
                + len(self.disagg_decode_prealloc_queue.queue)
                == 0
            ):
                # When the server is idle, do self-check and re-init some states
                self.check_memory()
                self.new_token_ratio = self.init_new_token_ratio

            # State consumed by the next iteration.
            self.last_batch = batch
            self.last_batch_in_queue = last_batch_in_queue
528

Byron Hsu's avatar
Byron Hsu committed
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
    def get_next_disagg_decode_batch_to_run(
        self: Scheduler,
    ) -> Optional[Tuple[ScheduleBatch, bool]]:
        """Create fake completed prefill if possible and merge with running batch"""
        # Merge the prefill batch into the running batch
        last_batch = self.last_batch
        if last_batch and last_batch.forward_mode.is_extend():
            # chunked prefill doesn't happen in decode instance.
            assert self.chunked_req is None
            # Filter finished batches.
            last_batch.filter_batch()
            if not last_batch.is_empty():
                if self.running_batch.is_empty():
                    self.running_batch = last_batch
                else:
                    # merge running_batch with prefill batch
                    self.running_batch.merge_batch(last_batch)

        new_prebuilt_batch = self.get_new_prebuilt_batch()

        ret: Optional[ScheduleBatch] = None
        if new_prebuilt_batch:
            ret = new_prebuilt_batch
        else:
            if self.running_batch.is_empty():
                ret = None
            else:
                self.running_batch = self.update_running_batch(self.running_batch)
                ret = self.running_batch if not self.running_batch.is_empty() else None

        return ret

    def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
        """Create a schedulebatch for fake completed prefill"""
        if len(self.waiting_queue) == 0:
            return None

        curr_batch_size = self.running_batch.batch_size()

        batch_size = min(self.req_to_token_pool.size, self.max_running_requests)

        num_not_used_batch = batch_size - curr_batch_size

        # pop req from waiting queue
        can_run_list: List[Req] = []
        waiting_queue: List[Req] = []

        for i in range(len(self.waiting_queue)):
            req = self.waiting_queue[i]
            # we can only add at least `num_not_used_batch` new batch to the running queue
            if i < num_not_used_batch:
                can_run_list.append(req)
                req.init_next_round_input(self.tree_cache)
            else:
                waiting_queue.append(req)

        self.waiting_queue = waiting_queue
        if len(can_run_list) == 0:
            return None
        # local import to avoid circular import
        from sglang.srt.managers.schedule_batch import ScheduleBatch

        # construct a schedule batch with those requests and mark as decode
        new_batch = ScheduleBatch.init_new(
            can_run_list,
            self.req_to_token_pool,
            self.token_to_kv_pool_allocator,
            self.tree_cache,
            self.model_config,
            self.enable_overlap,
            self.spec_algorithm,
            self.server_args.enable_custom_logit_processor,
        )

        # construct fake completed prefill
        new_batch.prepare_for_prebuilt_extend()
        new_batch.process_prebuilt_extend(self.server_args, self.model_config)

        return new_batch

    def process_decode_queue(self: Scheduler):
        req_conns = self.disagg_decode_prealloc_queue.pop_preallocated()
Liangsheng Yin's avatar
Liangsheng Yin committed
611
612
613
614
615

        def _num_pre_alloc(req):
            return len(req.req.origin_input_ids) + max(len(req.req.output_ids) - 1, 0)

        self.num_tokens_pre_allocated += sum(_num_pre_alloc(req) for req in req_conns)
Byron Hsu's avatar
Byron Hsu committed
616
617
618
619
        self.disagg_decode_transfer_queue.extend(req_conns)
        alloc_reqs = (
            self.disagg_decode_transfer_queue.pop_transferred()
        )  # the requests which kv has arrived
Liangsheng Yin's avatar
Liangsheng Yin committed
620
621
622
        self.num_tokens_pre_allocated -= sum(_num_pre_alloc(req) for req in alloc_reqs)

        self.waiting_queue.extend([req.req for req in alloc_reqs])