llm_engine.py 26.8 KB
Newer Older
Antoni Baum's avatar
Antoni Baum committed
1
import time
2
from typing import Iterable, List, Optional, Type, Union
3

4
5
from transformers import PreTrainedTokenizer

6
import vllm
7
8
9
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                         LoRAConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig, SpeculativeConfig,
10
                         VisionLanguageConfig)
Antoni Baum's avatar
Antoni Baum committed
11
from vllm.core.scheduler import Scheduler, SchedulerOutputs
Woosuk Kwon's avatar
Woosuk Kwon committed
12
from vllm.engine.arg_utils import EngineArgs
13
from vllm.engine.metrics import StatLogger, Stats
14
15
16
17
from vllm.engine.output_processor.interfaces import (
    SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.engine.output_processor.util import create_output_by_sequence_group
18
from vllm.engine.ray_utils import initialize_ray_cluster
19
from vllm.executor.executor_base import ExecutorBase
Woosuk Kwon's avatar
Woosuk Kwon committed
20
from vllm.logger import init_logger
21
from vllm.lora.request import LoRARequest
Woosuk Kwon's avatar
Woosuk Kwon committed
22
23
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
24
from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
25
                           SequenceGroup)
26
from vllm.transformers_utils.detokenizer import Detokenizer
27
28
from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                     get_tokenizer_group)
yhu422's avatar
yhu422 committed
29
30
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                  usage_message)
31
from vllm.utils import Counter
32
33

logger = init_logger(__name__)
34
_LOCAL_LOGGING_INTERVAL_SEC = 5
Woosuk Kwon's avatar
Woosuk Kwon committed
35

36

37
class LLMEngine:
    """An LLM engine that receives requests and generates texts.

    This is the main class for the vLLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The `LLM` class wraps this class for offline batched inference and the
    `AsyncLLMEngine` class wraps this class for online serving.

    NOTE: The config arguments are derived from the `EngineArgs` class. For the
    comprehensive list of arguments, see `EngineArgs`.

    Args:
        model_config: The configuration related to the LLM model.
        cache_config: The configuration related to the KV cache memory
            management.
        parallel_config: The configuration related to distributed execution.
        scheduler_config: The configuration related to the request scheduler.
        device_config: The configuration related to the device.
        load_config: The configuration related to loading the model (e.g.
            `download_dir` and `load_format`).
        lora_config (Optional): The configuration related to serving multi-LoRA.
        vision_language_config (Optional): The configuration related to vision
            language models.
        speculative_config (Optional): The configuration related to speculative
            decoding.
        decoding_config (Optional): The configuration related to decoding. A
            default `DecodingConfig()` is used when this is None.
        executor_class: The model executor class for managing distributed
            execution.
        log_stats: Whether to log statistics.
        usage_context: Specified entry point, used for usage info collection.
    """
70
71
72
73
74
75
76

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        vision_language_config: Optional[VisionLanguageConfig],
        speculative_config: Optional[SpeculativeConfig],
        decoding_config: Optional[DecodingConfig],
        executor_class: Type[ExecutorBase],
        log_stats: bool,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> None:
        """Wire up every engine component from the given configs.

        In order: logs the effective configuration, stores the configs on the
        instance, builds the tokenizer group and detokenizer, instantiates
        the model executor, initializes the KV caches, optionally reports
        usage statistics, pings the tokenizer, and finally creates the
        scheduler, the optional stat logger and the output processor.
        """
        logger.info(
            f"Initializing an LLM engine (v{vllm.__version__}) with config: "
            f"model={model_config.model!r}, "
            f"speculative_config={speculative_config!r}, "
            f"tokenizer={model_config.tokenizer!r}, "
            f"tokenizer_mode={model_config.tokenizer_mode}, "
            f"revision={model_config.revision}, "
            f"tokenizer_revision={model_config.tokenizer_revision}, "
            f"trust_remote_code={model_config.trust_remote_code}, "
            f"dtype={model_config.dtype}, "
            f"max_seq_len={model_config.max_model_len}, "
            f"download_dir={load_config.download_dir!r}, "
            f"load_format={load_config.load_format}, "
            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
            f"disable_custom_all_reduce="
            f"{parallel_config.disable_custom_all_reduce}, "
            f"quantization={model_config.quantization}, "
            f"enforce_eager={model_config.enforce_eager}, "
            f"kv_cache_dtype={cache_config.cache_dtype}, "
            f"quantization_param_path={model_config.quantization_param_path}, "
            f"device_config={device_config.device}, "
            f"decoding_config={decoding_config!r}, "
            f"seed={model_config.seed})")
        # TODO(woosuk): Print more configs in debug mode.

        # Keep every config reachable from the engine instance.
        self.model_config = model_config
        self.cache_config = cache_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.load_config = load_config
        self.lora_config = lora_config
        self.vision_language_config = vision_language_config
        self.speculative_config = speculative_config
        # Fall back to a default decoding config when none is supplied.
        self.decoding_config = decoding_config or DecodingConfig()
        self.log_stats = log_stats

        # The tokenizer group must exist before the detokenizer wraps it.
        self._init_tokenizer()
        self.detokenizer = Detokenizer(self.tokenizer)
        self.seq_counter = Counter()

        self.model_executor = executor_class(
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            vision_language_config=vision_language_config,
            speculative_config=speculative_config,
            load_config=load_config,
        )

        self._initialize_kv_caches()

        # If usage stat is enabled, collect relevant info.
        if is_usage_stats_enabled():
            from vllm.model_executor.model_loader import (
                get_architecture_class_name)

            usage_kvs = {
                # Common configuration
                "dtype": str(model_config.dtype),
                "tensor_parallel_size": parallel_config.tensor_parallel_size,
                "block_size": cache_config.block_size,
                "gpu_memory_utilization": cache_config.gpu_memory_utilization,
                # Quantization
                "quantization": model_config.quantization,
                "kv_cache_dtype": cache_config.cache_dtype,
                # Feature flags
                "enable_lora": bool(lora_config),
                "enable_prefix_caching": cache_config.enable_prefix_caching,
                "enforce_eager": model_config.enforce_eager,
                "disable_custom_all_reduce":
                parallel_config.disable_custom_all_reduce,
            }
            usage_message.report_usage(
                get_architecture_class_name(model_config),
                usage_context,
                extra_kvs=usage_kvs,
            )

        # Ping the tokenizer to ensure liveness if it runs in a
        # different process.
        self.tokenizer.ping()

        # Create the scheduler.
        # NOTE: the cache_config here have been updated with the numbers of
        # GPU and CPU blocks, which are profiled in the distributed executor.
        self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)

        # Metric Logging.
        if self.log_stats:
            self.stat_logger = StatLogger(
                local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                labels=dict(model_name=model_config.model))
            self.stat_logger.info("cache_config", self.cache_config)

        # Create sequence output processor, e.g. for beam search or
        # speculative decoding.
        stop_checker = StopChecker(
            self.scheduler_config.max_model_len,
            self.get_tokenizer_for_seq,
        )
        self.output_processor = (
            SequenceGroupOutputProcessor.create_output_processor(
                self.scheduler_config,
                self.detokenizer,
                self.scheduler,
                self.seq_counter,
                self.get_tokenizer_for_seq,
                stop_checker=stop_checker,
            ))

208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
    def _initialize_kv_caches(self) -> None:
        """Initialize the KV cache in the worker(s).

        The workers will determine the number of blocks in both the GPU cache
        and the swap CPU cache.
        """
        num_gpu_blocks, num_cpu_blocks = (
            self.model_executor.determine_num_available_blocks())

        if self.cache_config.num_gpu_blocks_override is not None:
            num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
            logger.info(f"Overriding {num_gpu_blocks=} with "
                        f"{num_gpu_blocks_override=}")
            num_gpu_blocks = num_gpu_blocks_override

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)

228
    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments.

        The executor class is chosen from the engine config: the Neuron or
        CPU executor by device type, otherwise the Ray GPU executor when
        `worker_use_ray` is set, otherwise the single-process GPU executor
        (which requires a world size of 1). Executor modules are imported
        only in the branch that uses them.
        """
        # Create the engine configs.
        engine_config = engine_args.create_engine_config()
        device_type = engine_config.device_config.device_type
        parallel_config = engine_config.parallel_config

        # Initialize the cluster and specify the executor class.
        if device_type == "neuron":
            from vllm.executor.neuron_executor import NeuronExecutor
            executor_class = NeuronExecutor
        elif device_type == "cpu":
            from vllm.executor.cpu_executor import CPUExecutor
            executor_class = CPUExecutor
        elif parallel_config.worker_use_ray:
            initialize_ray_cluster(parallel_config)
            from vllm.executor.ray_gpu_executor import RayGPUExecutor
            executor_class = RayGPUExecutor
        else:
            assert parallel_config.world_size == 1, (
                "Ray is required if parallel_config.world_size > 1.")
            from vllm.executor.gpu_executor import GPUExecutor
            executor_class = GPUExecutor

        # Create the LLM engine.
        return cls(
            **engine_config.to_dict(),
            executor_class=executor_class,
            log_stats=not engine_args.disable_log_stats,
            usage_context=usage_context,
        )
263

264
265
266
267
268
    def __reduce__(self):
        # This is to ensure that the LLMEngine is not referenced in
        # the closure used to initialize Ray worker actors
        raise RuntimeError("LLMEngine should not be pickled!")

269
    def get_tokenizer(self) -> "PreTrainedTokenizer":
270
        return self.tokenizer.get_lora_tokenizer(None)
271
272
273

    def get_tokenizer_for_seq(self,
                              sequence: Sequence) -> "PreTrainedTokenizer":
274
275
276
277
        return self.tokenizer.get_lora_tokenizer(sequence.lora_request)

    def _init_tokenizer(self, **tokenizer_init_kwargs):
        """Create the tokenizer group and store it on `self.tokenizer`.

        Defaults are taken from the model, LoRA and scheduler configs; any
        keyword passed by the caller overrides the default of the same name.
        """
        model_cfg = self.model_config
        defaults = {
            "tokenizer_id": model_cfg.tokenizer,
            "enable_lora": bool(self.lora_config),
            "max_num_seqs": self.scheduler_config.max_num_seqs,
            "max_input_length": None,
            "tokenizer_mode": model_cfg.tokenizer_mode,
            "trust_remote_code": model_cfg.trust_remote_code,
            "revision": model_cfg.tokenizer_revision,
        }
        # Caller-supplied kwargs win over the defaults.
        init_kwargs = {**defaults, **tokenizer_init_kwargs}

        self.tokenizer: BaseTokenizerGroup = get_tokenizer_group(
            self.parallel_config.tokenizer_pool_config, **init_kwargs)
288

289
290
    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
291
        self.cache_config.verify_with_parallel_config(self.parallel_config)
292
293
294
295
        if self.lora_config:
            self.lora_config.verify_with_model_config(self.model_config)
            self.lora_config.verify_with_scheduler_config(
                self.scheduler_config)
296

297
298
299
300
301
302
303
304
305
306
307
308
309
310
    def encode_request(
        self,
        request_id: str,  # pylint: disable=unused-argument
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = None,
        lora_request: Optional[LoRARequest] = None,
    ):
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = self.tokenizer.encode(request_id=request_id,
                                                     prompt=prompt,
                                                     lora_request=lora_request)
        return prompt_token_ids

311
312
313
    def add_request(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        """Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters for text generation.
                The engine works on a clone, so the caller's object is not
                modified.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            arrival_time: The arrival time of the request. If None, we use
                the current wall-clock time (`time.time()`), which is the
                same clock the engine's latency statistics use.
            lora_request: The LoRA request to apply to this request, if any.
                Requires LoRA to be enabled on the engine.
            multi_modal_data: Multi modal data per request.

        Raises:
            ValueError: If `lora_request` is given but LoRA is not enabled,
                or if `logprobs` / `prompt_logprobs` in `sampling_params`
                exceed the model config's `max_logprobs`.

        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
            - Create a single :class:`~vllm.Sequence` object for the prompt.
            - Create a :class:`~vllm.SequenceGroup` object
              from the list of :class:`~vllm.Sequence`.
            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.

        Example:
            >>> # initialize engine
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> # set request arguments
            >>> example_prompt = "Who is the president of the United States?"
            >>> sampling_params = SamplingParams(temperature=0.0)
            >>> request_id = 0
            >>>
            >>> # add the request to the engine
            >>> engine.add_request(
            >>>    str(request_id),
            >>>    example_prompt,
            >>>    sampling_params)
            >>> # continue the request processing
            >>> ...
        """
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        max_logprobs = self.get_model_config().max_logprobs
        if (sampling_params.logprobs
                and sampling_params.logprobs > max_logprobs) or (
                    sampling_params.prompt_logprobs
                    and sampling_params.prompt_logprobs > max_logprobs):
            raise ValueError(f"Cannot request more than "
                             f"{max_logprobs} logprobs.")
        if arrival_time is None:
            # Wall-clock on purpose: stats compare it against time.time().
            arrival_time = time.time()
        prompt_token_ids = self.encode_request(
            request_id=request_id,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
            lora_request=lora_request)

        # Create the sequences.
        block_size = self.cache_config.block_size
        seq_id = next(self.seq_counter)
        eos_token_id = self.tokenizer.get_lora_tokenizer(
            lora_request).eos_token_id
        seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                       eos_token_id, lora_request)

        # Defensive copy of SamplingParams, which are used by the sampler,
        # this doesn't deep-copy LogitsProcessor objects
        sampling_params = sampling_params.clone()
        # inject the eos token id into the sampling_params to support min_tokens
        # processing
        sampling_params.eos_token_id = seq.eos_token_id

        # Create the sequence group.
        seq_group = SequenceGroup(request_id, [seq], sampling_params,
                                  arrival_time, lora_request, multi_modal_data)

        # Add the sequence group to the scheduler.
        self.scheduler.add_seq_group(seq_group)

Antoni Baum's avatar
Antoni Baum committed
402
403
    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a request(s) with the given ID.
404
405

        Args:
Antoni Baum's avatar
Antoni Baum committed
406
            request_id: The ID(s) of the request to abort.
407
408
409
410
411
412
413
414
415
416
417

        Details:
            - Refer to the
              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
              from class :class:`~vllm.core.scheduler.Scheduler`.

        Example:
            >>> # initialize engine and add a request with request_id
            >>> request_id = str(0)
            >>> # abort the request
            >>> engine.abort_request(request_id)
418
        """
419
420
        self.scheduler.abort_seq_group(request_id)

421
422
423
424
    def get_model_config(self) -> ModelConfig:
        """Gets the model configuration."""
        return self.model_config

425
    def get_num_unfinished_requests(self) -> int:
426
        """Gets the number of unfinished requests."""
427
428
        return self.scheduler.get_num_unfinished_seq_groups()

429
    def has_unfinished_requests(self) -> bool:
430
        """Returns True if there are unfinished requests."""
431
432
        return self.scheduler.has_unfinished_seqs()

433
    def _process_model_outputs(
            self, output: List[SamplerOutput],
            scheduled_seq_groups: List[SequenceGroup],
            ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]:
        """Apply the model output to the sequences in the scheduled seq groups.

        Each element of `scheduled_seq_groups` is read through its
        `seq_group` and `token_chunk_size` attributes. Elements of
        `ignored_seq_groups` are converted to outputs directly.

        Returns RequestOutputs that can be returned to the client: one per
        scheduled group (in order), followed by one per ignored group.
        """
        now = time.time()

        # Organize outputs by [sequence group][step] instead of
        # [step][sequence group].
        output_by_sequence_group = create_output_by_sequence_group(
            sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups))

        # Update the scheduled sequence groups with the model outputs.
        for scheduled, group_outputs in zip(scheduled_seq_groups,
                                            output_by_sequence_group):
            group = scheduled.seq_group
            group.update_num_computed_tokens(scheduled.token_chunk_size)
            # If uncomputed tokens > 0, it means prefill is chunked.
            # We don't need to process outputs in that case.
            if group.get_num_uncomputed_tokens() != 0:
                continue
            self.output_processor.process_outputs(group, group_outputs)

        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

        # Create the outputs: scheduled groups first, then ignored ones.
        request_outputs: List[RequestOutput] = []
        for scheduled in scheduled_seq_groups:
            group = scheduled.seq_group
            group.maybe_set_first_token_time(now)
            request_outputs.append(RequestOutput.from_seq_group(group))
        for ignored_group in ignored_seq_groups:
            request_outputs.append(RequestOutput.from_seq_group(ignored_group))
        return request_outputs

Antoni Baum's avatar
Antoni Baum committed
475
476
477
    def step(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.

478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
        .. figure:: https://i.imgur.com/sv2HssD.png
            :alt: Overview of the step function
            :align: center

            Overview of the step function.

        Details:
            - Step 1: Schedules the sequences to be executed in the next
              iteration and the token blocks to be swapped in/out/copy.

                - Depending on the scheduling policy,
                  sequences may be `preempted/reordered`.
                - A Sequence Group (SG) refer to a group of sequences
                  that are generated from the same prompt.

493
            - Step 2: Calls the distributed executor to execute the model.
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
            - Step 3: Processes the model output. This mainly includes:

                - Decodes the relevant outputs.
                - Updates the scheduled sequence groups with model outputs
                  based on its `sampling parameters` (`use_beam_search` or not).
                - Frees the finished sequence groups.

            - Finally, it creates and returns the newly generated results.

        Example:
            >>> # Please see the example/ folder for more detailed examples.
            >>>
            >>> # initialize engine and request arguments
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> example_inputs = [(0, "What is LLM?",
            >>>    SamplingParams(temperature=0.0))]
            >>>
            >>> # Start the engine with an event loop
            >>> while True:
            >>>     if example_inputs:
            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
            >>>         engine.add_request(str(req_id), prompt, sampling_params)
            >>>
            >>>     # continue the request processing
            >>>     request_outputs = engine.step()
            >>>     for request_output in request_outputs:
            >>>         if request_output.finished:
            >>>             # return or show the request output
            >>>
            >>>     if not (engine.has_unfinished_requests() or example_inputs):
            >>>         break
Antoni Baum's avatar
Antoni Baum committed
525
        """
526
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
Antoni Baum's avatar
Antoni Baum committed
527

528
        if not scheduler_outputs.is_empty():
529
            output = self.model_executor.execute_model(
530
531
532
533
534
                seq_group_metadata_list=seq_group_metadata_list,
                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                blocks_to_copy=scheduler_outputs.blocks_to_copy,
                num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
535
536
        else:
            output = []
Antoni Baum's avatar
Antoni Baum committed
537

538
539
540
541
542
543
544
545
546
        request_outputs = self._process_model_outputs(
            output, scheduler_outputs.scheduled_seq_groups,
            scheduler_outputs.ignored_seq_groups)

        # Log stats.
        if self.log_stats:
            self.stat_logger.log(self._get_stats(scheduler_outputs))

        return request_outputs
Antoni Baum's avatar
Antoni Baum committed
547

548
    def do_log_stats(self) -> None:
549
550
551
        """Forced log when no requests active."""
        if self.log_stats:
            self.stat_logger.log(self._get_stats(scheduler_outputs=None))
552

553
554
555
    def _get_stats(self,
                   scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
        """Get Stats to be Logged to Prometheus.

        Args:
            scheduler_outputs: The scheduler output of the iteration that
                just ran, or None for a forced log with no active iteration.
                When None, all per-iteration token counts and latency lists
                are left empty or zero.

        Returns:
            A `Stats` snapshot with the scheduler queue sizes, KV cache usage
            as a fraction of blocks in use, and per-iteration token counts
            and latencies.
        """
        now = time.time()
        block_manager = self.scheduler.block_manager

        # KV Cache Usage in %.
        # A block count of None or 0 reports 0 usage instead of raising
        # TypeError / ZeroDivisionError.
        num_total_gpu = self.cache_config.num_gpu_blocks
        gpu_cache_usage = 0.
        if num_total_gpu:
            num_free_gpu = block_manager.get_num_free_gpu_blocks()
            gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)

        num_total_cpu = self.cache_config.num_cpu_blocks
        cpu_cache_usage = 0.
        if num_total_cpu:
            num_free_cpu = block_manager.get_num_free_cpu_blocks()
            cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)

        # Scheduler State
        num_running = len(self.scheduler.running)
        num_swapped = len(self.scheduler.swapped)
        num_waiting = len(self.scheduler.waiting)

        # Iteration stats if we have scheduler output.
        num_prompt_tokens = 0
        num_generation_tokens = 0
        time_to_first_tokens = []
        time_per_output_tokens = []
        time_e2e_requests = []
        if scheduler_outputs is not None:
            scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
            prompt_run = scheduler_outputs.num_prefill_groups > 0

            # Number of Tokens.
            if prompt_run:
                num_prompt_tokens = sum(
                    len(scheduled_seq_group.seq_group.prompt_token_ids)
                    for scheduled_seq_group in scheduled_seq_groups)
                num_generation_tokens = sum(
                    scheduled_seq_group.seq_group.num_seqs()
                    for scheduled_seq_group in scheduled_seq_groups)
            else:
                num_generation_tokens = scheduler_outputs.num_batched_tokens

            # Latency Timings.
            time_last_iters = []
            for scheduled_seq_group in scheduled_seq_groups:
                seq_group = scheduled_seq_group.seq_group
                # Time since last token.
                # (n.b. updates seq_group.metrics.last_token_time)
                time_last_iters.append(seq_group.get_last_latency(now))
                # Time since arrival for all finished requests.
                if seq_group.is_finished():
                    time_e2e_requests.append(now -
                                             seq_group.metrics.arrival_time)

            # The same latencies count as time-to-first-token in a prompt
            # run and as time-per-output-token otherwise.
            time_to_first_tokens = time_last_iters if prompt_run else []
            time_per_output_tokens = [] if prompt_run else time_last_iters

        return Stats(
            now=now,
            num_running=num_running,
            num_swapped=num_swapped,
            num_waiting=num_waiting,
            gpu_cache_usage=gpu_cache_usage,
            cpu_cache_usage=cpu_cache_usage,
            num_prompt_tokens=num_prompt_tokens,
            num_generation_tokens=num_generation_tokens,
            time_to_first_tokens=time_to_first_tokens,
            time_per_output_tokens=time_per_output_tokens,
            time_e2e_requests=time_e2e_requests,
        )

626
    def add_lora(self, lora_request: LoRARequest) -> bool:
627
        return self.model_executor.add_lora(lora_request)
628
629

    def remove_lora(self, lora_id: int) -> bool:
630
        return self.model_executor.remove_lora(lora_id)
631
632

    def list_loras(self) -> List[int]:
633
        return self.model_executor.list_loras()
634
635

    def check_health(self) -> None:
636
        self.model_executor.check_health()