Unverified Commit a68e293c authored by Hyogeun Oh (오효근), committed by GitHub
Browse files

[Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) to...


[Doc] Convert Sphinx directives (`{class}`, `{meth}`, `{attr}`, ...) to MkDocs format for better documentation linking (#18663)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
parent 68811079
...@@ -39,7 +39,8 @@ class CompilerInterface: ...@@ -39,7 +39,8 @@ class CompilerInterface:
Gather all the relevant information from the vLLM config, Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model. to compute a hash so that we can cache the compiled model.
See {meth}`VllmConfig.compute_hash` to check what information See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
to check what information
is already considered by default. This function should only is already considered by default. This function should only
consider the information that is specific to the compiler. consider the information that is specific to the compiler.
""" """
......
...@@ -2986,7 +2986,7 @@ class PoolerConfig: ...@@ -2986,7 +2986,7 @@ class PoolerConfig:
pooling_type: Optional[str] = None pooling_type: Optional[str] = None
""" """
The pooling method of the pooling model. This should be a key in The pooling method of the pooling model. This should be a key in
{class}`vllm.model_executor.layers.pooler.PoolingType`. [`vllm.model_executor.layers.pooler.PoolingType`][].
""" """
normalize: Optional[bool] = None normalize: Optional[bool] = None
...@@ -3697,23 +3697,27 @@ class CompilationConfig: ...@@ -3697,23 +3697,27 @@ class CompilationConfig:
"""Configuration for compilation. It has three parts: """Configuration for compilation. It has three parts:
- Top-level Compilation control: - Top-level Compilation control:
- {attr}`level` - [`level`][vllm.config.CompilationConfig.level]
- {attr}`debug_dump_path` - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
- {attr}`cache_dir` - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
- {attr}`backend` - [`backend`][vllm.config.CompilationConfig.backend]
- {attr}`custom_ops` - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
- {attr}`splitting_ops` - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
- CudaGraph capture: - CudaGraph capture:
- {attr}`use_cudagraph` - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
- {attr}`cudagraph_capture_sizes` - [`cudagraph_capture_sizes`]
- {attr}`cudagraph_num_of_warmups` [vllm.config.CompilationConfig.cudagraph_capture_sizes]
- {attr}`cudagraph_copy_inputs` - [`cudagraph_num_of_warmups`]
- {attr}`full_cuda_graph` [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
- [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
- [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
- Inductor compilation: - Inductor compilation:
- {attr}`use_inductor` - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- {attr}`compile_sizes` - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
- {attr}`inductor_compile_config` - [`inductor_compile_config`]
- {attr}`inductor_passes` [vllm.config.CompilationConfig.inductor_compile_config]
- [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
- custom inductor passes - custom inductor passes
Why we have different sizes for cudagraph and inductor: Why we have different sizes for cudagraph and inductor:
......
...@@ -167,4 +167,7 @@ class HTTPConnection: ...@@ -167,4 +167,7 @@ class HTTPConnection:
global_http_connection = HTTPConnection() global_http_connection = HTTPConnection()
"""The global {class}`HTTPConnection` instance used by vLLM.""" """
The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used
by vLLM.
"""
...@@ -475,7 +475,8 @@ class _AsyncLLMEngine(LLMEngine): ...@@ -475,7 +475,8 @@ class _AsyncLLMEngine(LLMEngine):
*, *,
inputs: Optional[PromptType] = None, # DEPRECATED inputs: Optional[PromptType] = None, # DEPRECATED
) -> None: ) -> None:
"""Async version of {meth}`add_request`.""" """Async version of
[`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
if inputs is not None: if inputs is not None:
prompt = inputs prompt = inputs
assert prompt is not None and params is not None assert prompt is not None and params is not None
...@@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async( ...@@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async(
class AsyncLLMEngine(EngineClient): class AsyncLLMEngine(EngineClient):
"""An asynchronous wrapper for {class}`LLMEngine`. """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].
This class is used to wrap the {class}`LLMEngine` class to make it This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
asynchronous. It uses asyncio to create a background loop that keeps make it asynchronous. It uses asyncio to create a background loop that keeps
processing incoming requests. The {class}`LLMEngine` is kicked by the processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
generate method when there are requests in the waiting queue. The generate by the generate method when there are requests in the waiting queue. The
method yields the outputs from the {class}`LLMEngine` to the caller. generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
to the caller.
Args: Args:
log_requests: Whether to log the requests. log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call. will be automatically started in the generate call.
*args: Arguments for {class}`LLMEngine`. *args: Arguments for [`LLMEngine`][vllm.LLMEngine].
**kwargs: Arguments for {class}`LLMEngine`. **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
""" """
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
...@@ -985,8 +987,9 @@ class AsyncLLMEngine(EngineClient): ...@@ -985,8 +987,9 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request. sampling_params: The sampling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
...@@ -1003,7 +1006,7 @@ class AsyncLLMEngine(EngineClient): ...@@ -1003,7 +1006,7 @@ class AsyncLLMEngine(EngineClient):
Details: Details:
- If the engine is not running, start the background loop, - If the engine is not running, start the background loop,
which iteratively invokes which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
to process the waiting requests. to process the waiting requests.
- Add the request to the engine's `RequestTracker`. - Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to On the next background loop, this request will be sent to
...@@ -1075,8 +1078,9 @@ class AsyncLLMEngine(EngineClient): ...@@ -1075,8 +1078,9 @@ class AsyncLLMEngine(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request. pooling_params: The pooling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
...@@ -1089,15 +1093,15 @@ class AsyncLLMEngine(EngineClient): ...@@ -1089,15 +1093,15 @@ class AsyncLLMEngine(EngineClient):
for the request. for the request.
Details: Details:
- If the engine is not running, start the background loop, - If the engine is not running, start the background loop,
which iteratively invokes which iteratively invokes
{meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
to process the waiting requests. to process the waiting requests.
- Add the request to the engine's `RequestTracker`. - Add the request to the engine's `RequestTracker`.
On the next background loop, this request will be sent to On the next background loop, this request will be sent to
the underlying engine. the underlying engine.
Also, a corresponding `AsyncStream` will be created. Also, a corresponding `AsyncStream` will be created.
- Wait for the request outputs from `AsyncStream` and yield them. - Wait for the request outputs from `AsyncStream` and yield them.
Example: Example:
``` ```
......
...@@ -130,11 +130,11 @@ class LLMEngine: ...@@ -130,11 +130,11 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the iteration-level scheduling and efficient memory management to maximize the
serving throughput. serving throughput.
The [LLM][vllm.LLM] class wraps this class for offline batched inference The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
and the [AsyncLLMEngine][] class wraps this class for online serving. and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
class wraps this class for online serving.
The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].
[engine-args][])
Args: Args:
vllm_config: The configuration for initializing and running vLLM. vllm_config: The configuration for initializing and running vLLM.
......
...@@ -492,8 +492,9 @@ class MQLLMEngineClient(EngineClient): ...@@ -492,8 +492,9 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
sampling_params: The sampling parameters of the request. sampling_params: The sampling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
...@@ -561,8 +562,9 @@ class MQLLMEngineClient(EngineClient): ...@@ -561,8 +562,9 @@ class MQLLMEngineClient(EngineClient):
from the LLMEngine to the caller. from the LLMEngine to the caller.
Args: Args:
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` prompt: The prompt to the LLM. See
for more details about the format of each input. [`PromptType`][vllm.inputs.PromptType] for more details about
the format of each input.
pooling_params: The pooling parameters of the request. pooling_params: The pooling parameters of the request.
request_id: The unique id of the request. request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
......
...@@ -42,19 +42,22 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) ...@@ -42,19 +42,22 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )
class MQLLMEngine: class MQLLMEngine:
"""A multiprocessing wrapper for {class}`LLMEngine`. """A multiprocessing wrapper for
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
This class is used to wrap the {class}`LLMEngine` class to enable use This class is used to wrap the
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
in a concurrent manner. It runs a background loop and uses zeromq to in a concurrent manner. It runs a background loop and uses zeromq to
receive new requests and stream outputs incrementally via ipc. receive new requests and stream outputs incrementally via ipc.
The {class}`LLMEngine` generate or encode process is kicked off when a new The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
RPCProcessRequest is received by the input_socket. process is kicked off when a new RPCProcessRequest is received by the
input_socket.
The self.engine_loop checks the input_socket for new requests, The self.engine_loop checks the input_socket for new requests,
adds them to the LLMEngine if there are any, calls the internal adds them to the LLMEngine if there are any, calls the internal
{class}`LLMEngine.step()`, and sends the RequestOutputs back over [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
the output_socket. the RequestOutputs back over the output_socket.
If use_async_sockets is set, the logic associated with reading new If use_async_sockets is set, the logic associated with reading new
requests from the socket and sending data to the socket is passed requests from the socket and sending data to the socket is passed
...@@ -65,8 +68,8 @@ class MQLLMEngine: ...@@ -65,8 +68,8 @@ class MQLLMEngine:
ipc_path: Base path for zeromq interprocess messaging ipc_path: Base path for zeromq interprocess messaging
use_async_sockets: Whether to make send/recv async with GPU use_async_sockets: Whether to make send/recv async with GPU
log_requests: Whether to log the requests. log_requests: Whether to log the requests.
*args: Arguments for {class}`LLMEngine`. *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
**kwargs: Arguments for {class}`LLMEngine`. **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
""" """
def __init__(self, def __init__(self,
......
...@@ -56,8 +56,11 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): ...@@ -56,8 +56,11 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation. scheduled computation.
Args: Args:
seq_group: the outputs are associated with this {class}`SequenceGroup` seq_group: the outputs are associated with this
outputs: the {class}`SequenceGroupOutput`s for all scheduler steps [`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s
for all scheduler steps
""" """
for output in outputs: for output in outputs:
# Concatenate single-step prompt logprob processing results. # Concatenate single-step prompt logprob processing results.
......
...@@ -19,17 +19,21 @@ logger = init_logger(__name__) ...@@ -19,17 +19,21 @@ logger = init_logger(__name__)
def single_step_process_prompt_logprob( def single_step_process_prompt_logprob(
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
output: CompletionSequenceGroupOutput) -> None: output: CompletionSequenceGroupOutput) -> None:
"""Process prompt logprobs associated with the {class}`SequenceGroupOutput` """Process prompt logprobs associated with the
for a given step. [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.
Do nothing if the output has no prompt logprobs. Do nothing if the output has no prompt logprobs.
Account for the fact that transformers do not compute first-token logprobs. Account for the fact that transformers do not compute first-token logprobs.
Args: Args:
sg_output_proc: {class}`SequenceGroupOutputProcessor` instance sg_output_proc:
seq_group: the output is associated with this {class}`SequenceGroup` [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
output: the {class}`SequenceGroupOutput` for a single scheduler step instance
seq_group: the output is associated with this
[`SequenceGroup`][vllm.sequence.SequenceGroup]
output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
""" """
prompt_logprobs = output.prompt_logprobs prompt_logprobs = output.prompt_logprobs
...@@ -103,8 +107,11 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): ...@@ -103,8 +107,11 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
scheduled computation. scheduled computation.
Args: Args:
seq_group: the output is associated with this {class}`SequenceGroup` seq_group: the output is associated with this
outputs: the {class}`SequenceGroupOutput` for a single scheduler step [`SequenceGroup`][vllm.sequence.SequenceGroup]
outputs: the
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
for a single scheduler step
""" """
assert len(outputs) == 1, "Single step should only have 1 output." assert len(outputs) == 1, "Single step should only have 1 output."
output = outputs[0] output = outputs[0]
......
...@@ -129,8 +129,7 @@ class LLM: ...@@ -129,8 +129,7 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration. is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].
[engine-args][])
Note: Note:
This class is intended to be used for offline inference. For online This class is intended to be used for offline inference. For online
...@@ -494,7 +493,7 @@ class LLM: ...@@ -494,7 +493,7 @@ class LLM:
`self` argument, in addition to the arguments passed in `args` `self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object. and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely. [`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method. args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method.
......
...@@ -582,7 +582,8 @@ class OpenAIServing: ...@@ -582,7 +582,8 @@ class OpenAIServing:
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> TextTokensPrompt: ) -> TextTokensPrompt:
""" """
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes single input. that assumes single input.
""" """
return next( return next(
...@@ -603,7 +604,8 @@ class OpenAIServing: ...@@ -603,7 +604,8 @@ class OpenAIServing:
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]: ) -> Iterator[TextTokensPrompt]:
""" """
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` A simpler implementation of
[`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
that assumes multiple inputs. that assumes multiple inputs.
""" """
for text in prompt_inputs: for text in prompt_inputs:
......
...@@ -74,7 +74,7 @@ class ExecutorBase(ABC): ...@@ -74,7 +74,7 @@ class ExecutorBase(ABC):
`self` argument, in addition to the arguments passed in `args` `self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object. and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a timeout: Maximum time in seconds to wait for execution. Raises a
{exc}`TimeoutError` on timeout. `None` means wait indefinitely. [`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method. args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method.
......
...@@ -10,8 +10,9 @@ from .registry import (DummyData, InputContext, InputProcessingContext, ...@@ -10,8 +10,9 @@ from .registry import (DummyData, InputContext, InputProcessingContext,
INPUT_REGISTRY = InputRegistry() INPUT_REGISTRY = InputRegistry()
""" """
The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine` The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
to dispatch data processing according to the target model. by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
target model.
""" """
__all__ = [ __all__ = [
......
...@@ -80,22 +80,24 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt] ...@@ -80,22 +80,24 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
""" """
Set of possible schemas for a single prompt: Set of possible schemas for a single prompt:
- A text prompt ({class}`str` or {class}`TextPrompt`) - A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt ({class}`TokensPrompt`) - A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ({class}`EmbedsPrompt`) - An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
Note that "singleton" is as opposed to a data structure Note that "singleton" is as opposed to a data structure
which encapsulates multiple prompts, i.e. of the sort which encapsulates multiple prompts, i.e. of the sort
which may be utilized for encoder/decoder models when which may be utilized for encoder/decoder models when
the user desires to express both the encoder & decoder the user desires to express both the encoder & decoder
prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt` prompts explicitly, i.e.
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
A prompt of type {class}`SingletonPrompt` may be employed A prompt of type [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] may be
as (1) input to a decoder-only model, (2) input to employed as (1) input to a decoder-only model, (2) input to
the encoder of an encoder/decoder model, in the scenario the encoder of an encoder/decoder model, in the scenario
where the decoder-prompt is not specified explicitly, or where the decoder-prompt is not specified explicitly, or
(3) as a member of a larger data structure encapsulating (3) as a member of a larger data structure encapsulating
more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt` more than one prompt, i.e.
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
""" """
...@@ -126,18 +128,20 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): ...@@ -126,18 +128,20 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
comprising an explicit encoder prompt and a decoder prompt. comprising an explicit encoder prompt and a decoder prompt.
The encoder and decoder prompts, respectively, may be formatted The encoder and decoder prompts, respectively, may be formatted
according to any of the {class}`SingletonPrompt` schemas, according to any of the
[`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] schemas,
and are not required to have the same schema. and are not required to have the same schema.
Only the encoder prompt may have multi-modal data. mm_processor_kwargs Only the encoder prompt may have multi-modal data. mm_processor_kwargs
should be at the top-level, and should not be set in the encoder/decoder should be at the top-level, and should not be set in the encoder/decoder
prompts, since they are agnostic to the encoder/decoder. prompts, since they are agnostic to the encoder/decoder.
Note that an {class}`ExplicitEncoderDecoderPrompt` may not Note that an
be used as an input to a decoder-only model, [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
may not be used as an input to a decoder-only model,
and that the `encoder_prompt` and `decoder_prompt` and that the `encoder_prompt` and `decoder_prompt`
fields of this data structure themselves must be fields of this data structure themselves must be
{class}`SingletonPrompt` instances. [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] instances.
""" """
encoder_prompt: _T1_co encoder_prompt: _T1_co
...@@ -152,11 +156,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt] ...@@ -152,11 +156,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
Set of possible schemas for an LLM input, including Set of possible schemas for an LLM input, including
both decoder-only and encoder/decoder input types: both decoder-only and encoder/decoder input types:
- A text prompt ({class}`str` or {class}`TextPrompt`) - A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt ({class}`TokensPrompt`) - A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ({class}`EmbedsPrompt`) - An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
- A single data structure containing both an encoder and a decoder prompt - A single data structure containing both an encoder and a decoder prompt
({class}`ExplicitEncoderDecoderPrompt`) ([`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt])
""" """
...@@ -189,7 +193,8 @@ def token_inputs( ...@@ -189,7 +193,8 @@ def token_inputs(
prompt: Optional[str] = None, prompt: Optional[str] = None,
cache_salt: Optional[str] = None, cache_salt: Optional[str] = None,
) -> TokenInputs: ) -> TokenInputs:
"""Construct {class}`TokenInputs` from optional values.""" """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
values."""
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
if prompt is not None: if prompt is not None:
...@@ -221,7 +226,8 @@ def embeds_inputs( ...@@ -221,7 +226,8 @@ def embeds_inputs(
prompt_embeds: torch.Tensor, prompt_embeds: torch.Tensor,
cache_salt: Optional[str] = None, cache_salt: Optional[str] = None,
) -> EmbedsInputs: ) -> EmbedsInputs:
"""Construct :class:`EmbedsInputs` from optional values.""" """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional
values."""
inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds) inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds)
if cache_salt is not None: if cache_salt is not None:
...@@ -232,7 +238,7 @@ def embeds_inputs( ...@@ -232,7 +238,7 @@ def embeds_inputs(
DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
""" """
The inputs in {class}`~vllm.LLMEngine` before they are The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are
passed to the model executor. passed to the model executor.
This specifies the data required for decoder-only models. This specifies the data required for decoder-only models.
""" """
...@@ -240,11 +246,12 @@ This specifies the data required for decoder-only models. ...@@ -240,11 +246,12 @@ This specifies the data required for decoder-only models.
class EncoderDecoderInputs(TypedDict): class EncoderDecoderInputs(TypedDict):
""" """
The inputs in {class}`~vllm.LLMEngine` before they are The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they
passed to the model executor. are passed to the model executor.
This specifies the required data for encoder-decoder models. This specifies the required data for encoder-decoder models.
""" """
encoder: Union[TokenInputs, "MultiModalInputs"] encoder: Union[TokenInputs, "MultiModalInputs"]
"""The inputs for the encoder portion.""" """The inputs for the encoder portion."""
...@@ -254,13 +261,13 @@ class EncoderDecoderInputs(TypedDict): ...@@ -254,13 +261,13 @@ class EncoderDecoderInputs(TypedDict):
SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
""" """
A processed {class}`SingletonPrompt` which can be passed to A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be
{class}`vllm.sequence.Sequence`. passed to [`vllm.sequence.Sequence`][].
""" """
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
""" """
The inputs to {data}`vllm.inputs.InputProcessor`. The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][].
""" """
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
...@@ -277,7 +284,8 @@ def build_explicit_enc_dec_prompt( ...@@ -277,7 +284,8 @@ def build_explicit_enc_dec_prompt(
return ExplicitEncoderDecoderPrompt( return ExplicitEncoderDecoderPrompt(
encoder_prompt=encoder_prompt, encoder_prompt=encoder_prompt,
decoder_prompt=decoder_prompt, decoder_prompt=decoder_prompt,
mm_processor_kwargs=mm_processor_kwargs) mm_processor_kwargs=mm_processor_kwargs,
)
def zip_enc_dec_prompts( def zip_enc_dec_prompts(
...@@ -288,7 +296,8 @@ def zip_enc_dec_prompts( ...@@ -288,7 +296,8 @@ def zip_enc_dec_prompts(
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
""" """
Zip encoder and decoder prompts together into a list of Zip encoder and decoder prompts together into a list of
{class}`ExplicitEncoderDecoderPrompt` instances. [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
instances.
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
dictionary will be used for every encoder/decoder prompt. If an iterable is dictionary will be used for every encoder/decoder prompt. If an iterable is
...@@ -299,10 +308,11 @@ def zip_enc_dec_prompts( ...@@ -299,10 +308,11 @@ def zip_enc_dec_prompts(
if isinstance(mm_processor_kwargs, dict): if isinstance(mm_processor_kwargs, dict):
return [ return [
build_explicit_enc_dec_prompt( build_explicit_enc_dec_prompt(
encoder_prompt, decoder_prompt, encoder_prompt,
cast(dict[str, Any], mm_processor_kwargs)) decoder_prompt,
for (encoder_prompt, cast(dict[str, Any], mm_processor_kwargs),
decoder_prompt) in zip(enc_prompts, dec_prompts) ) for (encoder_prompt,
decoder_prompt) in zip(enc_prompts, dec_prompts)
] ]
return [ return [
build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt, build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,
......
...@@ -23,13 +23,13 @@ class ParsedTokens(TypedDict): ...@@ -23,13 +23,13 @@ class ParsedTokens(TypedDict):
@overload @overload
def parse_and_batch_prompt( def parse_and_batch_prompt(
prompt: Union[str, list[str]]) -> Sequence[ParsedText]: prompt: Union[str, list[str]], ) -> Sequence[ParsedText]:
... ...
@overload @overload
def parse_and_batch_prompt( def parse_and_batch_prompt(
prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]: prompt: Union[list[int], list[list[int]]], ) -> Sequence[ParsedTokens]:
... ...
...@@ -86,7 +86,7 @@ class ParsedTokensPrompt(TypedDict): ...@@ -86,7 +86,7 @@ class ParsedTokensPrompt(TypedDict):
class ParsedEmbedsPrompt(TypedDict): class ParsedEmbedsPrompt(TypedDict):
type: Literal['embeds'] type: Literal["embeds"]
content: EmbedsPrompt content: EmbedsPrompt
...@@ -133,7 +133,7 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt: ...@@ -133,7 +133,7 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt:
def is_explicit_encoder_decoder_prompt( def is_explicit_encoder_decoder_prompt(
prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: prompt: PromptType, ) -> TypeIs[ExplicitEncoderDecoderPrompt]:
return isinstance(prompt, dict) and "encoder_prompt" in prompt return isinstance(prompt, dict) and "encoder_prompt" in prompt
......
...@@ -67,11 +67,11 @@ class InputPreprocessor: ...@@ -67,11 +67,11 @@ class InputPreprocessor:
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
def get_decoder_start_token_id(self) -> Optional[int]: def get_decoder_start_token_id(self) -> Optional[int]:
''' """
Obtain the decoder start token id employed by an encoder/decoder Obtain the decoder start token id employed by an encoder/decoder
model. Returns None for non-encoder/decoder models or if the model. Returns None for non-encoder/decoder models or if the
model config is unavailable. model config is unavailable.
''' """
if not self.model_config.is_encoder_decoder: if not self.model_config.is_encoder_decoder:
logger.warning_once( logger.warning_once(
...@@ -79,14 +79,14 @@ class InputPreprocessor: ...@@ -79,14 +79,14 @@ class InputPreprocessor:
"this is not an encoder/decoder model.") "this is not an encoder/decoder model.")
return None return None
if (self.model_config is None or self.model_config.hf_config is None): if self.model_config is None or self.model_config.hf_config is None:
logger.warning_once( logger.warning_once(
"Using None for decoder start token id because " "Using None for decoder start token id because "
"model config is not available.") "model config is not available.")
return None return None
dec_start_token_id = getattr(self.model_config.hf_config, dec_start_token_id = getattr(self.model_config.hf_config,
'decoder_start_token_id', None) "decoder_start_token_id", None)
if dec_start_token_id is None: if dec_start_token_id is None:
logger.warning_once( logger.warning_once(
"Falling back on <BOS> for decoder start token " "Falling back on <BOS> for decoder start token "
...@@ -97,7 +97,7 @@ class InputPreprocessor: ...@@ -97,7 +97,7 @@ class InputPreprocessor:
return dec_start_token_id return dec_start_token_id
def _get_default_enc_dec_decoder_prompt(self) -> list[int]: def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
''' """
Specifically for encoder/decoder models: Specifically for encoder/decoder models:
generate a default decoder prompt for when generate a default decoder prompt for when
the user specifies only the encoder prompt. the user specifies only the encoder prompt.
...@@ -126,7 +126,7 @@ class InputPreprocessor: ...@@ -126,7 +126,7 @@ class InputPreprocessor:
Returns: Returns:
* prompt_token_ids * prompt_token_ids
''' """
bos_token_id = self.get_bos_token_id() bos_token_id = self.get_bos_token_id()
assert bos_token_id is not None assert bos_token_id is not None
...@@ -224,7 +224,10 @@ class InputPreprocessor: ...@@ -224,7 +224,10 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest], lora_request: Optional[LoRARequest],
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[int]: ) -> list[int]:
"""Async version of {meth}`_tokenize_prompt`.""" """
Async version of
[`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt].
"""
tokenizer = self.get_tokenizer_group() tokenizer = self.get_tokenizer_group()
tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
...@@ -287,7 +290,10 @@ class InputPreprocessor: ...@@ -287,7 +290,10 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest], lora_request: Optional[LoRARequest],
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
"""Async version of {meth}`_process_multimodal`.""" """
Async version of
[`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal].
"""
tokenizer = await self._get_mm_tokenizer_async(lora_request) tokenizer = await self._get_mm_tokenizer_async(lora_request)
mm_processor = self.mm_registry.create_processor(self.model_config, mm_processor = self.mm_registry.create_processor(self.model_config,
...@@ -472,7 +478,7 @@ class InputPreprocessor: ...@@ -472,7 +478,7 @@ class InputPreprocessor:
Returns: Returns:
* {class}`SingletonInputs` instance * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
""" """
parsed = parse_singleton_prompt(prompt) parsed = parse_singleton_prompt(prompt)
...@@ -508,7 +514,10 @@ class InputPreprocessor: ...@@ -508,7 +514,10 @@ class InputPreprocessor:
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> SingletonInputs: ) -> SingletonInputs:
"""Async version of {meth}`_prompt_to_llm_inputs`.""" """
Async version of
[`_prompt_to_llm_inputs`][vllm.inputs.preprocess.InputPreprocessor._prompt_to_llm_inputs].
"""
parsed = parse_singleton_prompt(prompt) parsed = parse_singleton_prompt(prompt)
if parsed["type"] == "embeds": if parsed["type"] == "embeds":
...@@ -644,7 +653,9 @@ class InputPreprocessor: ...@@ -644,7 +653,9 @@ class InputPreprocessor:
) -> EncoderDecoderInputs: ) -> EncoderDecoderInputs:
""" """
For encoder/decoder models only: For encoder/decoder models only:
Process an input prompt into an {class}`EncoderDecoderInputs` instance. Process an input prompt into an
[`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance.
There are two types of input prompts: There are two types of input prompts:
singleton prompts which carry only the singleton prompts which carry only the
...@@ -670,7 +681,8 @@ class InputPreprocessor: ...@@ -670,7 +681,8 @@ class InputPreprocessor:
Returns: Returns:
* {class}`EncoderDecoderInputs` instance * [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance
""" """
encoder_inputs: SingletonInputs encoder_inputs: SingletonInputs
decoder_inputs: Optional[SingletonInputs] decoder_inputs: Optional[SingletonInputs]
...@@ -710,7 +722,10 @@ class InputPreprocessor: ...@@ -710,7 +722,10 @@ class InputPreprocessor:
prompt: PromptType, prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> EncoderDecoderInputs: ) -> EncoderDecoderInputs:
"""Async version of {meth}`_process_encoder_decoder_prompt`.""" """
Async version of
[`_process_encoder_decoder_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_encoder_decoder_prompt].
"""
encoder_inputs: SingletonInputs encoder_inputs: SingletonInputs
decoder_inputs: Optional[SingletonInputs] decoder_inputs: Optional[SingletonInputs]
...@@ -778,7 +793,8 @@ class InputPreprocessor: ...@@ -778,7 +793,8 @@ class InputPreprocessor:
) -> DecoderOnlyInputs: ) -> DecoderOnlyInputs:
""" """
For decoder-only models: For decoder-only models:
Process an input prompt into an {class}`DecoderOnlyInputs` instance. Process an input prompt into a
[`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.
Arguments: Arguments:
...@@ -789,7 +805,7 @@ class InputPreprocessor: ...@@ -789,7 +805,7 @@ class InputPreprocessor:
Returns: Returns:
* {class}`DecoderOnlyInputs` instance * [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
""" """
prompt_comps = self._prompt_to_llm_inputs( prompt_comps = self._prompt_to_llm_inputs(
...@@ -812,7 +828,10 @@ class InputPreprocessor: ...@@ -812,7 +828,10 @@ class InputPreprocessor:
prompt_adapter_request: Optional[PromptAdapterRequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> DecoderOnlyInputs: ) -> DecoderOnlyInputs:
"""Async version of {meth}`_process_decoder_only_prompt`.""" """
Async version of
[`_process_decoder_only_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_decoder_only_prompt].
"""
prompt_comps = await self._prompt_to_llm_inputs_async( prompt_comps = await self._prompt_to_llm_inputs_async(
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
...@@ -863,7 +882,10 @@ class InputPreprocessor: ...@@ -863,7 +882,10 @@ class InputPreprocessor:
prompt_adapter_request: Optional[PromptAdapterRequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False, return_mm_hashes: bool = False,
) -> ProcessorInputs: ) -> ProcessorInputs:
"""Async version of {meth}`preprocess`.""" """
Async version of
[`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
"""
if self.model_config.is_encoder_decoder: if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, ( assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ", "Multimodal hashes for encoder-decoder models should not be ",
......
...@@ -38,7 +38,7 @@ class InputContext: ...@@ -38,7 +38,7 @@ class InputContext:
) -> _C: ) -> _C:
""" """
Get the HuggingFace configuration Get the HuggingFace configuration
({class}`transformers.PretrainedConfig`) of the model, (`transformers.PretrainedConfig`) of the model,
additionally checking its type. additionally checking its type.
Raises: Raises:
...@@ -79,7 +79,7 @@ class InputContext: ...@@ -79,7 +79,7 @@ class InputContext:
) -> _P: ) -> _P:
""" """
Get the HuggingFace processor Get the HuggingFace processor
({class}`transformers.ProcessorMixin`) of the model, (`transformers.ProcessorMixin`) of the model,
additionally checking its type. additionally checking its type.
Raises: Raises:
......
...@@ -68,22 +68,22 @@ class _VllmLogger(Logger): ...@@ -68,22 +68,22 @@ class _VllmLogger(Logger):
""" """
Note: Note:
This class is just to provide type information. This class is just to provide type information.
We actually patch the methods directly on the {class}`logging.Logger` We actually patch the methods directly on the [`logging.Logger`][]
instance to avoid conflicting with other libraries such as instance to avoid conflicting with other libraries such as
`intel_extension_for_pytorch.utils._logger`. `intel_extension_for_pytorch.utils._logger`.
""" """
def info_once(self, msg: str, *args: Hashable) -> None: def info_once(self, msg: str, *args: Hashable) -> None:
""" """
As {meth}`info`, but subsequent calls with the same message As [`info`][logging.Logger.info], but subsequent calls with
are silently dropped. the same message are silently dropped.
""" """
_print_info_once(self, msg, *args) _print_info_once(self, msg, *args)
def warning_once(self, msg: str, *args: Hashable) -> None: def warning_once(self, msg: str, *args: Hashable) -> None:
""" """
As {meth}`warning`, but subsequent calls with the same message As [`warning`][logging.Logger.warning], but subsequent calls with
are silently dropped. the same message are silently dropped.
""" """
_print_warning_once(self, msg, *args) _print_warning_once(self, msg, *args)
......
...@@ -228,17 +228,19 @@ class Sampler(nn.Module): ...@@ -228,17 +228,19 @@ class Sampler(nn.Module):
) -> Optional[SamplerOutput]: ) -> Optional[SamplerOutput]:
""" """
Single-step scheduling: Single-step scheduling:
* Perform GPU-side sampling computation & compute * Perform GPU-side sampling computation & compute
GPU-side logprobs tensor GPU-side logprobs tensor
* Pythonize sampling result & logprobs tensor * Pythonize sampling result & logprobs tensor
Multi-step scheduling: Multi-step scheduling:
* Perform GPU-side sampling computation & compute * Perform GPU-side sampling computation & compute
GPU-side logprobs tensor GPU-side logprobs tensor
* Defer Pythonization of sampling result & logprobs * Defer Pythonization of sampling result & logprobs
tensor tensor
* Encapsulate arguments required for deferred Pythonization * Encapsulate arguments required for deferred Pythonization
in the {class}`SamplerOutput` structure in the
[`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput]
structure
Args: Args:
logits: (num_tokens, vocab_size). logits: (num_tokens, vocab_size).
......
...@@ -226,9 +226,11 @@ class SupportsPP(Protocol): ...@@ -226,9 +226,11 @@ class SupportsPP(Protocol):
intermediate_tensors: Optional["IntermediateTensors"], intermediate_tensors: Optional["IntermediateTensors"],
) -> Union[Tensor, "IntermediateTensors"]: ) -> Union[Tensor, "IntermediateTensors"]:
""" """
Accept {class}`IntermediateTensors` when PP rank > 0. Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when
PP rank > 0.
Return {class}`IntermediateTensors` only for the last PP rank. Return [`IntermediateTensors`][vllm.sequence.IntermediateTensors] only
for the last PP rank.
""" """
... ...
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment