Unverified Commit d6484ef3 authored by Harry Mellor's avatar Harry Mellor Committed by GitHub
Browse files

Add full API docs and improve the UX of navigating them (#17485)


Signed-off-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 46fae69c
# SPDX-License-Identifier: Apache-2.0
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
from vllm.distributed.kv_transfer.kv_transfer_state import (
ensure_kv_transfer_initialized, get_kv_transfer_group,
has_kv_transfer_group, is_v1_kv_transfer_group)
......
......@@ -475,7 +475,7 @@ class _AsyncLLMEngine(LLMEngine):
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
"""Async version of :meth:`add_request`."""
"""Async version of {meth}`add_request`."""
if inputs is not None:
prompt = inputs
assert prompt is not None and params is not None
......@@ -582,20 +582,20 @@ async def build_guided_decoding_logits_processor_async(
class AsyncLLMEngine(EngineClient):
"""An asynchronous wrapper for :class:`LLMEngine`.
"""An asynchronous wrapper for {class}`LLMEngine`.
This class is used to wrap the :class:`LLMEngine` class to make it
This class is used to wrap the {class}`LLMEngine` class to make it
asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The :class:`LLMEngine` is kicked by the
processing incoming requests. The {class}`LLMEngine` is kicked by the
generate method when there are requests in the waiting queue. The generate
method yields the outputs from the :class:`LLMEngine` to the caller.
method yields the outputs from the {class}`LLMEngine` to the caller.
Args:
log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call.
*args: Arguments for :class:`LLMEngine`.
**kwargs: Arguments for :class:`LLMEngine`.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
"""
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
......@@ -985,7 +985,7 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller.
Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
......@@ -1003,7 +1003,7 @@ class AsyncLLMEngine(EngineClient):
Details:
- If the engine is not running, start the background loop,
which iteratively invokes
:meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
......@@ -1075,7 +1075,7 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller.
Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.
......@@ -1089,46 +1089,48 @@ class AsyncLLMEngine(EngineClient):
for the request.
Details:
- If the engine is not running, start the background loop,
which iteratively invokes
:meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
the underlying engine.
Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them.
- If the engine is not running, start the background loop,
which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
to process the waiting requests.
- Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to
the underlying engine.
Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them.
Example:
>>> # Please refer to entrypoints/api_server.py for
>>> # the complete example.
>>>
>>> # initialize the engine and the example input
>>> # note that engine_args here is AsyncEngineArgs instance
>>> engine = AsyncLLMEngine.from_engine_args(engine_args)
>>> example_input = {
>>> "input": "What is LLM?",
>>> "request_id": 0,
>>> }
>>>
>>> # start the generation
>>> results_generator = engine.encode(
>>> example_input["input"],
>>> PoolingParams(),
>>> example_input["request_id"])
>>>
>>> # get the results
>>> final_output = None
>>> async for request_output in results_generator:
>>> if await request.is_disconnected():
>>> # Abort the request if the client disconnects.
>>> await engine.abort(request_id)
>>> # Return or raise an error
>>> ...
>>> final_output = request_output
>>>
>>> # Process and return the final output
>>> ...
```
# Please refer to entrypoints/api_server.py for
# the complete example.
# initialize the engine and the example input
# note that engine_args here is AsyncEngineArgs instance
engine = AsyncLLMEngine.from_engine_args(engine_args)
example_input = {
"input": "What is LLM?",
"request_id": 0,
}
# start the generation
results_generator = engine.encode(
example_input["input"],
PoolingParams(),
example_input["request_id"])
# get the results
final_output = None
async for request_output in results_generator:
if await request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
# Return or raise an error
...
final_output = request_output
# Process and return the final output
...
```
"""
try:
async for output in await self.add_request(
......
......@@ -130,11 +130,11 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the
serving throughput.
The :class:`~vllm.LLM` class wraps this class for offline batched inference
and the :class:`AsyncLLMEngine` class wraps this class for online serving.
The {class}`~vllm.LLM` class wraps this class for offline batched inference
and the {class}`AsyncLLMEngine` class wraps this class for online serving.
The config arguments are derived from :class:`~vllm.EngineArgs`. (See
:ref:`engine-args`)
The config arguments are derived from {class}`~vllm.EngineArgs`. (See
{ref}`engine-args`)
Args:
model_config: The configuration related to the LLM model.
......@@ -694,11 +694,11 @@ class LLMEngine:
Args:
request_id: The unique ID of the request.
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
params: Parameters for sampling or pooling.
:class:`~vllm.SamplingParams` for text generation.
:class:`~vllm.PoolingParams` for pooling.
{class}`~vllm.SamplingParams` for text generation.
{class}`~vllm.PoolingParams` for pooling.
arrival_time: The arrival time of the request. If None, we use
the current monotonic time.
lora_request: The LoRA request to add.
......@@ -710,10 +710,10 @@ class LLMEngine:
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of :class:`~vllm.Sequence` objects.
- Create a :class:`~vllm.SequenceGroup` object
from the list of :class:`~vllm.Sequence`.
- Add the :class:`~vllm.SequenceGroup` object to the scheduler.
- Create `n` number of {class}`~vllm.Sequence` objects.
- Create a {class}`~vllm.SequenceGroup` object
from the list of {class}`~vllm.Sequence`.
- Add the {class}`~vllm.SequenceGroup` object to the scheduler.
Example:
>>> # initialize engine
......@@ -861,8 +861,8 @@ class LLMEngine:
Details:
- Refer to the
:meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class :class:`~vllm.core.scheduler.Scheduler`.
{meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class {class}`~vllm.core.scheduler.Scheduler`.
Example:
>>> # initialize engine and add a request with request_id
......@@ -1258,53 +1258,56 @@ class LLMEngine:
def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
"""Performs one decoding iteration and returns newly generated results.
.. figure:: https://i.imgur.com/sv2HssD.png
:alt: Overview of the step function
:align: center
:::{figure} https://i.imgur.com/sv2HssD.png
:alt: Overview of the step function
:align: center
Overview of the step function.
Overview of the step function.
:::
Details:
- Step 1: Schedules the sequences to be executed in the next
iteration and the token blocks to be swapped in/out/copy.
- Step 1: Schedules the sequences to be executed in the next
iteration and the token blocks to be swapped in/out/copy.
- Depending on the scheduling policy,
sequences may be `preempted/reordered`.
- A Sequence Group (SG) refer to a group of sequences
that are generated from the same prompt.
- Depending on the scheduling policy,
sequences may be `preempted/reordered`.
- A Sequence Group (SG) refer to a group of sequences
that are generated from the same prompt.
- Step 2: Calls the distributed executor to execute the model.
- Step 3: Processes the model output. This mainly includes:
- Step 2: Calls the distributed executor to execute the model.
- Step 3: Processes the model output. This mainly includes:
- Decodes the relevant outputs.
- Updates the scheduled sequence groups with model outputs
based on its `sampling parameters` (`use_beam_search` or not).
- Frees the finished sequence groups.
- Decodes the relevant outputs.
- Updates the scheduled sequence groups with model outputs
based on its `sampling parameters` (`use_beam_search` or not).
- Frees the finished sequence groups.
- Finally, it creates and returns the newly generated results.
- Finally, it creates and returns the newly generated results.
Example:
>>> # Please see the example/ folder for more detailed examples.
>>>
>>> # initialize engine and request arguments
>>> engine = LLMEngine.from_engine_args(engine_args)
>>> example_inputs = [(0, "What is LLM?",
>>> SamplingParams(temperature=0.0))]
>>>
>>> # Start the engine with an event loop
>>> while True:
>>> if example_inputs:
>>> req_id, prompt, sampling_params = example_inputs.pop(0)
>>> engine.add_request(str(req_id),prompt,sampling_params)
>>>
>>> # continue the request processing
>>> request_outputs = engine.step()
>>> for request_output in request_outputs:
>>> if request_output.finished:
>>> # return or show the request output
>>>
>>> if not (engine.has_unfinished_requests() or example_inputs):
>>> break
```
# Please see the example/ folder for more detailed examples.
# initialize engine and request arguments
engine = LLMEngine.from_engine_args(engine_args)
example_inputs = [(0, "What is LLM?",
SamplingParams(temperature=0.0))]
# Start the engine with an event loop
while True:
if example_inputs:
req_id, prompt, sampling_params = example_inputs.pop(0)
engine.add_request(str(req_id),prompt,sampling_params)
# continue the request processing
request_outputs = engine.step()
for request_output in request_outputs:
if request_output.finished:
# return or show the request output
if not (engine.has_unfinished_requests() or example_inputs):
break
```
"""
if self.parallel_config.pipeline_parallel_size > 1:
raise NotImplementedError(
......
......@@ -491,7 +491,7 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller.
Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
......@@ -560,7 +560,7 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller.
Args:
prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
for more details about the format of each input.
pooling_params: The pooling parameters of the request.
request_id: The unique id of the request.
......
......@@ -41,18 +41,18 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )
class MQLLMEngine:
"""A multiprocessing wrapper for :class:`LLMEngine`.
"""A multiprocessing wrapper for {class}`LLMEngine`.
This class is used to wrap the :class:`LLMEngine` class to enable use
This class is used to wrap the {class}`LLMEngine` class to enable use
in concurrnet manner. It runs a background loop and uses zeromq to
receive new requests and stream outputs incrementally via ipc.
The :class:`LLMEngine` generate or encode process is kicked off when a new
The {class}`LLMEngine` generate or encode process is kicked off when a new
RPCProcessRequest is received by the input_socket.
The self.engine_loop checks the input_socket for new requests,
adds them to the LLMEngine if there are any, calls the internal
:class:`LLMEngine.step()`, and sends the RequestOutputs back over
{class}`LLMEngine.step()`, and sends the RequestOutputs back over
the output_socket.
If use_async_sockets is set, the logic associated with reading new
......@@ -64,8 +64,8 @@ class MQLLMEngine:
ipc_path: Base path for zeromq interprocess messaging
use_async_sockets: Whether to make send/recv async with GPU
log_requests: Whether to log the requests.
*args: Arguments for :class:`LLMEngine`.
**kwargs: Arguments for :class:`LLMEngine`.
*args: Arguments for {class}`LLMEngine`.
**kwargs: Arguments for {class}`LLMEngine`.
"""
def __init__(self,
......
......@@ -56,8 +56,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation.
Args:
seq_group: the outputs are associated with this :class:`SequenceGroup`
outputs: the :class:`SequenceGroupOutput`s for all scheduler steps
seq_group: the outputs are associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
"""
for output in outputs:
# Concatenate single-step prompt logprob processing results.
......
......@@ -19,7 +19,7 @@ logger = init_logger(__name__)
def single_step_process_prompt_logprob(
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
output: CompletionSequenceGroupOutput) -> None:
"""Process prompt logprobs associated with the :class:`SequenceGroupOutput`
"""Process prompt logprobs associated with the {class}`SequenceGroupOutput`
for a given step.
Do nothing if the output has no prompt logprobs.
......@@ -27,9 +27,9 @@ def single_step_process_prompt_logprob(
Account for the fact that transformers do not compute first-token logprobs.
Args:
sg_output_proc: :class:`SequenceGroupOutputProcessor` instance
seq_group: the output is associated with this :class:`SequenceGroup`
output: the :class:`SequenceGroupOutput` for a single scheduler step
sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
seq_group: the output is associated with this {class}`SequenceGroup`
output: the {class}`SequenceGroupOutput` for a single scheduler step
"""
prompt_logprobs = output.prompt_logprobs
......@@ -103,8 +103,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation.
Args:
seq_group: the output is associated with this :class:`SequenceGroup`
outputs: the :class:`SequenceGroupOutput` for a single scheduler step
seq_group: the output is associated with this {class}`SequenceGroup`
outputs: the {class}`SequenceGroupOutput` for a single scheduler step
"""
assert len(outputs) == 1, "Single step should only have 1 output."
output = outputs[0]
......
......@@ -115,7 +115,7 @@ class LLM:
to eager mode. Additionally for encoder-decoder models, if the
sequence length of the encoder input is larger than this, we fall
back to the eager mode.
disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
disable_async_output_proc: Disable async output processing.
This may result in lower performance.
hf_token: The token to use as HTTP bearer authorization for remote files
......@@ -127,12 +127,13 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
:ref:`engine-args`)
**kwargs: Arguments for {class}`~vllm.EngineArgs`. (See
{ref}`engine-args`)
Note:
This class is intended to be used for offline inference. For online
serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
:::{note}
This class is intended to be used for offline inference. For online
serving, use the {class}`~vllm.AsyncLLMEngine` class instead.
:::
"""
DEPRECATE_LEGACY: ClassVar[bool] = True
......@@ -141,7 +142,7 @@ class LLM:
DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
"""
A flag to toggle whether to deprecate positional arguments in
:meth:`LLM.__init__`.
{meth}`LLM.__init__`.
"""
@classmethod
......@@ -398,7 +399,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters.
......@@ -413,13 +414,14 @@ class LLM:
Only applicable when priority scheduling policy is enabled.
Returns:
A list of ``RequestOutput`` objects containing the
A list of `RequestOutput` objects containing the
generated completions in the same order as the input prompts.
Note:
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
:::{note}
Using `prompts` and `prompt_token_ids` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the `inputs` parameter.
:::
"""
runner_type = self.llm_engine.model_config.runner_type
if runner_type not in ["generate", "transcription"]:
......@@ -488,16 +490,17 @@ class LLM:
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
:exc:`TimeoutError` on timeout. `None` means wait indefinitely.
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.
Returns:
A list containing the results from each worker.
Note:
It is recommended to use this API to only pass control messages,
and set up data-plane communication to pass data.
:::{note}
It is recommended to use this API to only pass control messages,
and set up data-plane communication to pass data.
:::
"""
return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
......@@ -664,7 +667,7 @@ class LLM:
Generate responses for a chat conversation.
The chat conversation is converted into a text prompt using the
tokenizer and calls the :meth:`generate` method to generate the
tokenizer and calls the {meth}`generate` method to generate the
responses.
Multi-modal inputs can be passed in the same way you would pass them
......@@ -903,7 +906,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.
......@@ -913,13 +916,14 @@ class LLM:
generation, if any.
Returns:
A list of ``PoolingRequestOutput`` objects containing the
A list of `PoolingRequestOutput` objects containing the
pooled hidden states in the same order as the input prompts.
Note:
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
:::{note}
Using `prompts` and `prompt_token_ids` as keyword parameters is
considered legacy and may be deprecated in the future. You should
instead pass them via the `inputs` parameter.
:::
"""
runner_type = self.llm_engine.model_config.runner_type
if runner_type != "pooling":
......@@ -992,7 +996,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.
......@@ -1036,7 +1040,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See :class:`~vllm.inputs.PromptType`
for batch inference. See {class}`~vllm.inputs.PromptType`
for more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
......@@ -1168,7 +1172,7 @@ class LLM:
text_1: can be a single prompt or a list of prompts, in which
case it has to have the same length as the ``text_2`` list
text_2: The texts to pair with the query to form the input
to the LLM. See :class:`~vllm.inputs.PromptType` for
to the LLM. See {class}`~vllm.inputs.PromptType` for
more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
......@@ -1277,7 +1281,7 @@ class LLM:
def wake_up(self, tags: Optional[list[str]] = None):
"""
Wake up the engine from sleep mode. See the :meth:`sleep` method
Wake up the engine from sleep mode. See the {meth}`sleep` method
for more details.
Args:
......
......@@ -5,7 +5,6 @@
import json
import re
import time
from argparse import Namespace
from typing import Annotated, Any, ClassVar, Literal, Optional, Union
import torch
......@@ -25,23 +24,7 @@ from vllm.utils import random_uuid, resolve_obj_by_qualname
logger = init_logger(__name__)
# torch is mocked during docs generation,
# so we have to provide the values as literals
_MOCK_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807)
_LONG_INFO: Union["torch.iinfo", Namespace]
try:
from sphinx.ext.autodoc.mock import _MockModule
if isinstance(torch, _MockModule):
_LONG_INFO = _MOCK_LONG_INFO
else:
_LONG_INFO = torch.iinfo(torch.long)
except ModuleNotFoundError:
_LONG_INFO = torch.iinfo(torch.long)
assert _LONG_INFO.min == _MOCK_LONG_INFO.min
assert _LONG_INFO.max == _MOCK_LONG_INFO.max
_LONG_INFO = torch.iinfo(torch.long)
class OpenAIBaseModel(BaseModel):
......
......@@ -275,7 +275,7 @@ class OpenAIServing:
add_special_tokens: bool = True,
) -> TextTokensPrompt:
"""
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
that assumes single input.
"""
return next(
......@@ -296,7 +296,7 @@ class OpenAIServing:
add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]:
"""
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
that assumes multiple inputs.
"""
for text in prompt_inputs:
......
......@@ -74,7 +74,7 @@ class ExecutorBase(ABC):
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
:exc:`TimeoutError` on timeout. `None` means wait indefinitely.
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.
......
......@@ -10,7 +10,7 @@ from .registry import (DummyData, InputContext, InputProcessingContext,
INPUT_REGISTRY = InputRegistry()
"""
The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine`
to dispatch data processing according to the target model.
"""
......
......@@ -80,22 +80,22 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
"""
Set of possible schemas for a single prompt:
- A text prompt (:class:`str` or :class:`TextPrompt`)
- A tokenized prompt (:class:`TokensPrompt`)
- An embeddings prompt (:class:`EmbedsPrompt`)
- A text prompt ({class}`str` or {class}`TextPrompt`)
- A tokenized prompt ({class}`TokensPrompt`)
- An embeddings prompt ({class}`EmbedsPrompt`)
Note that "singleton" is as opposed to a data structure
which encapsulates multiple prompts, i.e. of the sort
which may be utilized for encoder/decoder models when
the user desires to express both the encoder & decoder
prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt`
prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt`
A prompt of type :class:`SingletonPrompt` may be employed
A prompt of type {class}`SingletonPrompt` may be employed
as (1) input to a decoder-only model, (2) input to
the encoder of an encoder/decoder model, in the scenario
where the decoder-prompt is not specified explicitly, or
(3) as a member of a larger data structure encapsulating
more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt`
more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt`
"""
_T1_co = TypeVar("_T1_co",
......@@ -115,18 +115,18 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
comprising an explicit encoder prompt and a decoder prompt.
The encoder and decoder prompts, respectively, may be formatted
according to any of the :class:`SingletonPrompt` schemas,
according to any of the {class}`SingletonPrompt` schemas,
and are not required to have the same schema.
Only the encoder prompt may have multi-modal data. mm_processor_kwargs
should be at the top-level, and should not be set in the encoder/decoder
prompts, since they are agnostic to the encoder/decoder.
Note that an :class:`ExplicitEncoderDecoderPrompt` may not
Note that an {class}`ExplicitEncoderDecoderPrompt` may not
be used as an input to a decoder-only model,
and that the :code:`encoder_prompt` and :code:`decoder_prompt`
and that the `encoder_prompt` and `decoder_prompt`
fields of this data structure themselves must be
:class:`SingletonPrompt` instances.
{class}`SingletonPrompt` instances.
"""
encoder_prompt: _T1_co
......@@ -141,11 +141,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
Set of possible schemas for an LLM input, including
both decoder-only and encoder/decoder input types:
- A text prompt (:class:`str` or :class:`TextPrompt`)
- A tokenized prompt (:class:`TokensPrompt`)
- An embeddings prompt (:class:`EmbedsPrompt`)
- A text prompt ({class}`str` or {class}`TextPrompt`)
- A tokenized prompt ({class}`TokensPrompt`)
- An embeddings prompt ({class}`EmbedsPrompt`)
- A single data structure containing both an encoder and a decoder prompt
(:class:`ExplicitEncoderDecoderPrompt`)
({class}`ExplicitEncoderDecoderPrompt`)
"""
......@@ -178,7 +178,7 @@ def token_inputs(
prompt: Optional[str] = None,
cache_salt: Optional[str] = None,
) -> TokenInputs:
"""Construct :class:`TokenInputs` from optional values."""
"""Construct {class}`TokenInputs` from optional values."""
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
if prompt is not None:
......@@ -221,7 +221,7 @@ def embeds_inputs(
DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
"""
The inputs in :class:`~vllm.LLMEngine` before they are
The inputs in {class}`~vllm.LLMEngine` before they are
passed to the model executor.
This specifies the data required for decoder-only models.
"""
......@@ -229,7 +229,7 @@ This specifies the data required for decoder-only models.
class EncoderDecoderInputs(TypedDict):
"""
The inputs in :class:`~vllm.LLMEngine` before they are
The inputs in {class}`~vllm.LLMEngine` before they are
passed to the model executor.
This specifies the required data for encoder-decoder models.
......@@ -243,13 +243,13 @@ class EncoderDecoderInputs(TypedDict):
SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
"""
A processed :class:`SingletonPrompt` which can be passed to
:class:`vllm.sequence.Sequence`.
A processed {class}`SingletonPrompt` which can be passed to
{class}`vllm.sequence.Sequence`.
"""
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
"""
The inputs to :data:`vllm.inputs.InputProcessor`.
The inputs to {data}`vllm.inputs.InputProcessor`.
"""
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
......@@ -277,7 +277,7 @@ def zip_enc_dec_prompts(
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
"""
Zip encoder and decoder prompts together into a list of
:class:`ExplicitEncoderDecoderPrompt` instances.
{class}`ExplicitEncoderDecoderPrompt` instances.
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
dictionary will be used for every encoder/decoder prompt. If an iterable is
......
......@@ -224,7 +224,7 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest],
tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[int]:
"""Async version of :meth:`_tokenize_prompt`."""
"""Async version of {meth}`_tokenize_prompt`."""
tokenizer = self.get_tokenizer_group()
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
......@@ -287,7 +287,7 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""Async version of :meth:`_process_multimodal`."""
"""Async version of {meth}`_process_multimodal`."""
tokenizer = await self._get_mm_tokenizer_async(lora_request)
mm_processor = self.mm_registry.create_processor(self.model_config,
......@@ -472,7 +472,7 @@ class InputPreprocessor:
Returns:
* :class:`SingletonInputs` instance
* {class}`SingletonInputs` instance
"""
parsed = parse_singleton_prompt(prompt)
......@@ -508,7 +508,7 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs:
"""Async version of :meth:`_prompt_to_llm_inputs`."""
"""Async version of {meth}`_prompt_to_llm_inputs`."""
parsed = parse_singleton_prompt(prompt)
if parsed["type"] == "embeds":
......@@ -644,7 +644,7 @@ class InputPreprocessor:
) -> EncoderDecoderInputs:
"""
For encoder/decoder models only:
Process an input prompt into an :class:`EncoderDecoderInputs` instance.
Process an input prompt into an {class}`EncoderDecoderInputs` instance.
There are two types of input prompts:
singleton prompts which carry only the
......@@ -670,7 +670,7 @@ class InputPreprocessor:
Returns:
* :class:`EncoderDecoderInputs` instance
* {class}`EncoderDecoderInputs` instance
"""
encoder_inputs: SingletonInputs
decoder_inputs: Optional[SingletonInputs]
......@@ -710,7 +710,7 @@ class InputPreprocessor:
prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> EncoderDecoderInputs:
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
"""Async version of {meth}`_process_encoder_decoder_prompt`."""
encoder_inputs: SingletonInputs
decoder_inputs: Optional[SingletonInputs]
......@@ -778,7 +778,7 @@ class InputPreprocessor:
) -> DecoderOnlyInputs:
"""
For decoder-only models:
Process an input prompt into an :class:`DecoderOnlyInputs` instance.
Process an input prompt into an {class}`DecoderOnlyInputs` instance.
Arguments:
......@@ -789,7 +789,7 @@ class InputPreprocessor:
Returns:
* :class:`DecoderOnlyInputs` instance
* {class}`DecoderOnlyInputs` instance
"""
prompt_comps = self._prompt_to_llm_inputs(
......@@ -812,7 +812,7 @@ class InputPreprocessor:
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
"""Async version of :meth:`_process_decoder_only_prompt`."""
"""Async version of {meth}`_process_decoder_only_prompt`."""
prompt_comps = await self._prompt_to_llm_inputs_async(
prompt,
tokenization_kwargs=tokenization_kwargs,
......@@ -863,7 +863,7 @@ class InputPreprocessor:
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs:
"""Async version of :meth:`preprocess`."""
"""Async version of {meth}`preprocess`."""
if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ",
......
......@@ -38,7 +38,7 @@ class InputContext:
) -> _C:
"""
Get the HuggingFace configuration
(:class:`transformers.PretrainedConfig`) of the model,
({class}`transformers.PretrainedConfig`) of the model,
additionally checking its type.
Raises:
......@@ -79,7 +79,7 @@ class InputContext:
) -> _P:
"""
Get the HuggingFace processor
(:class:`transformers.ProcessorMixin`) of the model,
({class}`transformers.ProcessorMixin`) of the model,
additionally checking its type.
Raises:
......@@ -135,8 +135,8 @@ class InputProcessingContext(InputContext):
kwargs: Mapping[str, object] = {},
) -> BatchFeature:
"""
Call :code:`hf_processor` on the prompt :code:`data`
(text, image, audio...) with configurable options :code:`kwargs`.
Call `hf_processor` on the prompt `data`
(text, image, audio...) with configurable options `kwargs`.
"""
assert callable(hf_processor)
......
......@@ -68,21 +68,21 @@ class _VllmLogger(Logger):
"""
Note:
This class is just to provide type information.
We actually patch the methods directly on the :class:`logging.Logger`
We actually patch the methods directly on the {class}`logging.Logger`
instance to avoid conflicting with other libraries such as
`intel_extension_for_pytorch.utils._logger`.
"""
def info_once(self, msg: str, *args: Hashable) -> None:
"""
As :meth:`info`, but subsequent calls with the same message
As {meth}`info`, but subsequent calls with the same message
are silently dropped.
"""
_print_info_once(self, msg, *args)
def warning_once(self, msg: str, *args: Hashable) -> None:
"""
As :meth:`warning`, but subsequent calls with the same message
As {meth}`warning`, but subsequent calls with the same message
are silently dropped.
"""
_print_warning_once(self, msg, *args)
......
# SPDX-License-Identifier: Apache-2.0
from vllm.lora.ops.triton_ops.lora_expand import lora_expand
from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand
from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta
from vllm.lora.ops.triton_ops.lora_shrink import lora_shrink
from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink
__all__ = [
"lora_expand",
......
......@@ -261,15 +261,16 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
True, then a token can be accepted, else it should be
rejected.
Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
:math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according
to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
{math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according
to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
same conditional probability according to the draft model, the token
is accepted with probability:
.. math::
\min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
{p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
:::{math}
\min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
{p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
:::
This implementation does not apply causality. When using the output,
if a token is rejected, subsequent tokens should not be used.
......@@ -312,18 +313,20 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
target model is recovered (within hardware numerics).
The probability distribution used in this rejection case is constructed
as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of
:math:`x` given context :math:`x_1, \dots, x_n` according to the target
model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability
as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of
{math}`x` given context {math}`x_1, \dots, x_n` according to the target
model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability
according to the draft model:
.. math::
x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
:::{math}
x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
:::
where :math:`(f(x))_+` is defined as:
where {math}`(f(x))_+` is defined as:
.. math::
(f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
:::{math}
(f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
:::
See https://github.com/vllm-project/vllm/pull/2336 for a visualization
of the draft, target, and recovered probability distributions.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment