Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
273cb3b4
Unverified
Commit
273cb3b4
authored
May 24, 2025
by
Cyrus Leung
Committed by
GitHub
May 23, 2025
Browse files
[Doc] Fix top-level API links/docs (#18621)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
8ddd1cf2
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
88 additions
and
103 deletions
+88
-103
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+12
-10
vllm/config.py
vllm/config.py
+1
-1
vllm/distributed/kv_transfer/kv_connector/v1/base.py
vllm/distributed/kv_transfer/kv_connector/v1/base.py
+5
-4
vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+20
-20
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+15
-25
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+32
-31
vllm/multimodal/__init__.py
vllm/multimodal/__init__.py
+1
-1
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+2
-2
vllm/outputs.py
vllm/outputs.py
+0
-9
No files found.
vllm/benchmarks/datasets.py
View file @
273cb3b4
...
...
@@ -129,16 +129,17 @@ class BenchmarkDataset(ABC):
Args:
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
LoRA is selected. max_loras (Optional[int]): The maximum number of
LoRAs available. If None, LoRA is not used. lora_path
(Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
is not used.
LoRA is selected.
max_loras (Optional[int]): The maximum number of LoRAs available.
If `None`, LoRA is not used.
lora_path (Optional[str]): Path to the LoRA parameters on disk.
If `None`, LoRA is not used.
Returns:
tuple[Optional[LoRARequest], AnyTokenizer]:
A tuple w
here
the f
irst
element is a
LoRARequest (or None if not applicable)
and the second
element is t
he tokenizer associated with the LoRA request
(or the
base tokenizer).
A tuple w
ith
the f
ollowing elements:
- A new [
LoRARequest
][]
(or
`
None
`
if not applicable)
.
- T
he tokenizer associated with the LoRA request
(or the
base tokenizer).
"""
if
max_loras
is
None
or
lora_path
is
None
:
return
None
,
tokenizer
...
...
@@ -184,7 +185,8 @@ class BenchmarkDataset(ABC):
Args:
requests (List[SampleRequest]): The current list of sampled
requests. num_requests (int): The target number of requests.
requests.
num_requests (int): The target number of requests.
"""
if
len
(
requests
)
<
num_requests
:
random
.
seed
(
self
.
random_seed
)
...
...
vllm/config.py
View file @
273cb3b4
...
...
@@ -4552,7 +4552,7 @@ def contains_object_print(text):
text (str): The text to check
Returns:
bool: True if a match is found, False otherwise
result (
bool
)
:
`
True
`
if a match is found,
`
False
`
otherwise
.
"""
pattern
=
r
'at 0x[a-fA-F0-9]{2,16}>'
match
=
re
.
search
(
pattern
,
text
)
...
...
vllm/distributed/kv_transfer/kv_connector/v1/base.py
View file @
273cb3b4
...
...
@@ -210,9 +210,10 @@ class KVConnectorBase_V1(ABC):
computed tokens for this request
Returns:
* the number of tokens that can be loaded from the
A tuple with the following elements:
- The number of tokens that can be loaded from the
external KV cache beyond what is already computed.
* t
rue if external KV cache tokens will be loaded
- `T
rue
`
if external KV cache tokens will be loaded
asynchronously (between scheduler steps).
"""
pass
...
...
vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
View file @
273cb3b4
...
...
@@ -118,11 +118,11 @@ class PyNcclPipe(KVPipeBase):
"""
Create the metadata as a dictionary based on the input tensor.
Parameter
s:
-
tensor: The input tensor or None if no tensor is provided.
Arg
s:
tensor: The input tensor or None if no tensor is provided.
Returns:
-
metadata: A dictionary with the following keys:
metadata: A dictionary with the following keys:
- "dtype": The data type of the tensor or None.
- "shape": The shape of the tensor or None.
"""
...
...
@@ -135,13 +135,13 @@ class PyNcclPipe(KVPipeBase):
"""
Create a buffer to receive the tensor based on the provided metadata.
Parameter
s:
-
metadata: A dictionary with keys "dtype" and "shape",
describing
the tensor's data type and shape.
Arg
s:
metadata: A dictionary with keys "dtype" and "shape",
describing
the tensor's data type and shape.
Returns:
-
buffer: A tensor of the specified type and shape,
allocated on
self.device.
buffer: A tensor of the specified type and shape,
allocated on `
self.device
`
.
"""
return
torch
.
empty
(
metadata
[
"shape"
],
dtype
=
metadata
[
"dtype"
],
...
...
@@ -151,8 +151,8 @@ class PyNcclPipe(KVPipeBase):
"""
Send the metadata dictionary to the target rank.
Parameter
s:
-
metadata: A dictionary with keys "dtype" and "shape".
Arg
s:
metadata: A dictionary with keys "dtype" and "shape".
"""
self
.
group
.
send_obj
(
metadata
,
self
.
target_rank_for_send
)
...
...
@@ -161,8 +161,8 @@ class PyNcclPipe(KVPipeBase):
Receive the metadata dictionary from the target rank.
Returns:
-
metadata: A dictionary with keys "dtype" and "shape"
describing
the tensor.
metadata: A dictionary with keys "dtype" and "shape"
describing
the tensor.
"""
return
self
.
group
.
recv_obj
(
self
.
target_rank_for_recv
)
...
...
@@ -171,8 +171,8 @@ class PyNcclPipe(KVPipeBase):
The actual implementation of sending the tensor and its metadata to the
target rank.
Parameter
s:
-
tensor: The input tensor to be sent, or None if no tensor is
Arg
s:
tensor: The input tensor to be sent, or
`
None
`
if no tensor is
being sent.
"""
metadata
=
self
.
_make_metadata
(
tensor
)
...
...
@@ -187,7 +187,7 @@ class PyNcclPipe(KVPipeBase):
the target rank.
Returns:
-
buffer: The received tensor, or None if no tensor is received.
buffer: The received tensor, or
`
None
`
if no tensor is received.
"""
metadata
=
self
.
_recv_metadata
()
if
metadata
[
"dtype"
]
is
None
:
...
...
@@ -227,8 +227,8 @@ class PyNcclPipe(KVPipeBase):
Sends a tensor and its metadata to the destination rank in a
non-blocking way.
Parameter
s:
-
tensor: The tensor to send, or None if no tensor is being sent.
Arg
s:
tensor: The tensor to send, or
`
None
`
if no tensor is being sent.
"""
if
self
.
transport_thread
is
None
:
self
.
transport_thread
=
ThreadPoolExecutor
(
max_workers
=
1
)
...
...
@@ -250,8 +250,8 @@ class PyNcclPipe(KVPipeBase):
"""
Receives a tensor and its metadata from the source rank. Blocking call.
Return
s:
-
tensor: The received tensor, or None if no tensor is received.
Arg
s:
tensor: The received tensor, or
`
None
`
if no tensor is received.
"""
if
self
.
transport_thread
is
None
:
self
.
transport_thread
=
ThreadPoolExecutor
(
max_workers
=
1
)
...
...
vllm/engine/llm_engine.py
View file @
273cb3b4
...
...
@@ -130,26 +130,16 @@ class LLMEngine:
iteration-level scheduling and efficient memory management to maximize the
serving throughput.
The
{class}`~
vllm.LLM
`
class wraps this class for offline batched inference
and the
{class}`
AsyncLLMEngine
`
class wraps this class for online serving.
The
[LLM][
vllm.LLM
]
class wraps this class for offline batched inference
and the
[
AsyncLLMEngine
][]
class wraps this class for online serving.
The config arguments are derived from
{class}`~
vllm.EngineArgs
`
. (See
{ref}`
engine-args
`
)
The config arguments are derived from
[EngineArgs][
vllm.EngineArgs
]
. (See
[
engine-args
][]
)
Args:
model_config: The configuration related to the LLM model.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
device_config: The configuration related to the device.
lora_config (Optional): The configuration related to serving multi-LoRA.
speculative_config (Optional): The configuration related to speculative
decoding.
vllm_config: The configuration for initializing and running vLLM.
executor_class: The model executor class for managing distributed
execution.
prompt_adapter_config (Optional): The configuration related to serving
prompt adapters.
log_stats: Whether to log statistics.
usage_context: Specified entry point, used for usage info collection.
"""
...
...
@@ -695,11 +685,12 @@ class LLMEngine:
Args:
request_id: The unique ID of the request.
prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
prompt: The prompt to the LLM. See
[PromptType][vllm.inputs.PromptType]
for more details about the format of each input.
params: Parameters for sampling or pooling.
{class}`~
vllm.SamplingParams
`
for text generation.
{class}`~
vllm.PoolingParams
`
for pooling.
[SamplingParams][
vllm.SamplingParams
]
for text generation.
[PoolingParams][
vllm.PoolingParams
]
for pooling.
arrival_time: The arrival time of the request. If None, we use
the current monotonic time.
lora_request: The LoRA request to add.
...
...
@@ -711,10 +702,11 @@ class LLMEngine:
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of {class}`~vllm.Sequence` objects.
- Create a {class}`~vllm.SequenceGroup` object
from the list of {class}`~vllm.Sequence`.
- Add the {class}`~vllm.SequenceGroup` object to the scheduler.
- Create `n` number of [Sequence][vllm.Sequence] objects.
- Create a [SequenceGroup][vllm.SequenceGroup] object
from the list of [Sequence][vllm.Sequence].
- Add the [SequenceGroup][vllm.SequenceGroup] object to the
scheduler.
Example:
>>> # initialize engine
...
...
@@ -861,9 +853,7 @@ class LLMEngine:
request_id: The ID(s) of the request to abort.
Details:
- Refer to the
{meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
from class {class}`~vllm.core.scheduler.Scheduler`.
- Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].
Example:
>>> # initialize engine and add a request with request_id
...
...
vllm/entrypoints/llm.py
View file @
273cb3b4
...
...
@@ -116,7 +116,8 @@ class LLM:
to eager mode. Additionally for encoder-decoder models, if the
sequence length of the encoder input is larger than this, we fall
back to the eager mode.
disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
disable_custom_all_reduce: See
[ParallelConfig][vllm.config.ParallelConfig].
disable_async_output_proc: Disable async output processing.
This may result in lower performance.
hf_token: The token to use as HTTP bearer authorization for remote files
...
...
@@ -128,12 +129,12 @@ class LLM:
compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the level of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration.
**kwargs: Arguments for
{class}`~
vllm.EngineArgs
`
. (See
{ref}`
engine-args
`
)
**kwargs: Arguments for
[EngineArgs][
vllm.EngineArgs
]
. (See
[
engine-args
][]
)
Note:
This class is intended to be used for offline inference. For online
serving, use the
{class}`~
vllm.AsyncLLMEngine
`
class instead.
serving, use the
[AsyncLLMEngine][
vllm.AsyncLLMEngine
]
class instead.
"""
DEPRECATE_LEGACY
:
ClassVar
[
bool
]
=
True
...
...
@@ -142,7 +143,7 @@ class LLM:
DEPRECATE_INIT_POSARGS
:
ClassVar
[
bool
]
=
True
"""
A flag to toggle whether to deprecate positional arguments in
{meth}`
LLM.__init__
`
.
[
LLM.__init__
][]
.
"""
@
classmethod
...
...
@@ -403,7 +404,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See
{class}`~
vllm.inputs.PromptType
`
for batch inference. See
[PromptType][
vllm.inputs.PromptType
]
for more details about the format of each prompts.
sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters.
...
...
@@ -669,7 +670,7 @@ class LLM:
Generate responses for a chat conversation.
The chat conversation is converted into a text prompt using the
tokenizer and calls the
{meth}`
generate
`
method to generate the
tokenizer and calls the
[
generate
][]
method to generate the
responses.
Multi-modal inputs can be passed in the same way you would pass them
...
...
@@ -693,23 +694,23 @@ class LLM:
chat_template_content_format: The format to render message content.
- "string" will render the content as a string.
Example:
`
`"Who are you?"`
`
Example: `"Who are you?"`
- "openai" will render the content as a list of dictionaries,
similar to OpenAI schema.
Example:
`
`[{"type": "text", "text": "Who are you?"}]`
`
Example: `[{"type": "text", "text": "Who are you?"}]`
add_generation_prompt: If True, adds a generation template
to each message.
continue_final_message: If True, continues the final message in
the conversation instead of starting a new one. Cannot be
`
`True`
`
if
`
`add_generation_prompt`
`
is also
`
`True`
`
.
`True` if `add_generation_prompt` is also `True`.
chat_template_kwargs: Additional kwargs to pass to the chat
template.
mm_processor_kwargs: Multimodal processor kwarg overrides for this
chat request. Only used for offline requests.
Returns:
A list of
`
`RequestOutput`
`
objects containing the generated
A list of `RequestOutput` objects containing the generated
responses in the same order as the input messages.
"""
list_of_messages
:
list
[
list
[
ChatCompletionMessageParam
]]
...
...
@@ -908,7 +909,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See
{class}`~
vllm.inputs.PromptType
`
for batch inference. See
[PromptType][
vllm.inputs.PromptType
]
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.
...
...
@@ -997,7 +998,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See
{class}`~
vllm.inputs.PromptType
`
for batch inference. See
[PromptType][
vllm.inputs.PromptType
]
for more details about the format of each prompts.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.
...
...
@@ -1007,7 +1008,7 @@ class LLM:
generation, if any.
Returns:
A list of
`
`EmbeddingRequestOutput`
`
objects containing the
A list of `EmbeddingRequestOutput` objects containing the
embedding vectors in the same order as the input prompts.
"""
if
self
.
llm_engine
.
model_config
.
task
!=
"embed"
:
...
...
@@ -1041,7 +1042,7 @@ class LLM:
Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See
{class}`~
vllm.inputs.PromptType
`
for batch inference. See
[PromptType][
vllm.inputs.PromptType
]
for more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
...
...
@@ -1049,7 +1050,7 @@ class LLM:
generation, if any.
Returns:
A list of
`
`ClassificationRequestOutput`
`
objects containing the
A list of `ClassificationRequestOutput` objects containing the
embedding vectors in the same order as the input prompts.
"""
if
self
.
llm_engine
.
model_config
.
task
!=
"classify"
:
...
...
@@ -1159,11 +1160,11 @@ class LLM:
lora_request
:
Optional
[
Union
[
list
[
LoRARequest
],
LoRARequest
]]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
)
->
list
[
ScoringRequestOutput
]:
"""Generate similarity scores for all pairs
`
`<text,text_pair>`
`
.
"""Generate similarity scores for all pairs `<text,text_pair>`.
The inputs can be
`
`1 -> 1`
`
,
`
`1 -> N`
`
or
`
`N -> N`
`
.
In the
`
`1 - N`
`
case the
`
`text_1`
`
sentence will be replicated
`
`N`
`
times to pair with the
`
`text_2`
`
sentences.
The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
In the `1 - N` case the `text_1` sentence will be replicated `N`
times to pair with the `text_2` sentences.
The input pairs are used to build a list of prompts for the
cross encoder model. This class automatically batches the prompts,
considering the memory constraint. For the best performance, put all
...
...
@@ -1171,9 +1172,9 @@ class LLM:
Args:
text_1: can be a single prompt or a list of prompts, in which
case it has to have the same length as the
`
`text_2`
`
list
case it has to have the same length as the `text_2` list
text_2: The texts to pair with the query to form the input
to the LLM. See
{class}`~
vllm.inputs.PromptType
`
for
to the LLM. See
[PromptType][
vllm.inputs.PromptType
]
for
more details about the format of each prompts.
use_tqdm: Whether to use tqdm to display the progress bar.
lora_request: LoRA request to use for generation, if any.
...
...
@@ -1181,7 +1182,7 @@ class LLM:
generation, if any.
Returns:
A list of
`
`ScoringRequestOutput`
`
objects containing the
A list of `ScoringRequestOutput` objects containing the
generated scores in the same order as the input prompts.
"""
runner_type
=
self
.
llm_engine
.
model_config
.
runner_type
...
...
@@ -1282,13 +1283,13 @@ class LLM:
def
wake_up
(
self
,
tags
:
Optional
[
list
[
str
]]
=
None
):
"""
Wake up the engine from sleep mode. See the
{meth}`
sleep
`
method
Wake up the engine from sleep mode. See the
[
sleep
][]
method
for more details.
Args:
tags: An optional list of tags to reallocate the engine memory
for specific memory allocations. Values must be in
("weights", "kv_cache"
,
). If None, all memory is reallocated.
`
("weights", "kv_cache")
`
. If None, all memory is reallocated.
wake_up should be called with all tags (or None) before the
engine is used again.
"""
...
...
vllm/multimodal/__init__.py
View file @
273cb3b4
...
...
@@ -12,7 +12,7 @@ The global {class}`~MultiModalRegistry` is used by model runners to
dispatch data processing according to the target model.
Info:
{ref}`
mm-processing
`
[
mm-processing
][]
"""
__all__
=
[
...
...
vllm/multimodal/registry.py
View file @
273cb3b4
...
...
@@ -215,7 +215,7 @@ class MultiModalRegistry:
invoked to transform the data into a dictionary of model inputs.
Info:
{ref}`
mm-processing
`
[
mm-processing
][]
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
...
...
@@ -260,7 +260,7 @@ class MultiModalRegistry:
Create a multi-modal processor for a specific model and tokenizer.
Info:
{ref}`
mm-processing
`
[
mm-processing
][]
"""
if
not
model_config
.
is_multimodal_model
:
raise
ValueError
(
f
"
{
model_config
.
model
}
is not a multimodal model"
)
...
...
vllm/outputs.py
View file @
273cb3b4
...
...
@@ -391,15 +391,6 @@ class PoolingRequestOutput(Generic[_O]):
prompt_token_ids
,
finished
)
def
__repr__
(
self
):
"""
Returns a string representation of an PoolingRequestOutput instance.
The representation includes the request_id and the number of outputs,
providing a quick overview of the pooling request's results.
Returns:
str: A string representation of the PoolingRequestOutput instance.
"""
return
(
f
"
{
type
(
self
).
__name__
}
(request_id=
{
self
.
request_id
!
r
}
, "
f
"outputs=
{
self
.
outputs
!
r
}
, "
f
"prompt_token_ids=
{
self
.
prompt_token_ids
}
, "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment