Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
535 additions
and
397 deletions
+535
-397
docs/usage/v1_guide.md
docs/usage/v1_guide.md
+5
-4
examples/offline_inference/audio_language.py
examples/offline_inference/audio_language.py
+32
-1
examples/offline_inference/chat_with_tools.py
examples/offline_inference/chat_with_tools.py
+1
-1
examples/offline_inference/data_parallel.py
examples/offline_inference/data_parallel.py
+8
-0
examples/offline_inference/disaggregated_prefill.py
examples/offline_inference/disaggregated_prefill.py
+6
-6
examples/offline_inference/encoder_decoder.py
examples/offline_inference/encoder_decoder.py
+2
-0
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+3
-0
examples/offline_inference/logits_processor/custom.py
examples/offline_inference/logits_processor/custom.py
+0
-0
examples/offline_inference/logits_processor/custom_req.py
examples/offline_inference/logits_processor/custom_req.py
+151
-0
examples/offline_inference/logits_processor/custom_req_init.py
...les/offline_inference/logits_processor/custom_req_init.py
+165
-0
examples/offline_inference/multilora_inference.py
examples/offline_inference/multilora_inference.py
+1
-1
examples/offline_inference/neuron.py
examples/offline_inference/neuron.py
+0
-49
examples/offline_inference/neuron_eagle.py
examples/offline_inference/neuron_eagle.py
+0
-61
examples/offline_inference/neuron_int8_quantization.py
examples/offline_inference/neuron_int8_quantization.py
+0
-63
examples/offline_inference/neuron_multimodal.py
examples/offline_inference/neuron_multimodal.py
+0
-110
examples/offline_inference/neuron_speculation.py
examples/offline_inference/neuron_speculation.py
+0
-64
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/prithvi_geospatial_mae.py
+5
-1
examples/offline_inference/prithvi_geospatial_mae_io_processor.py
.../offline_inference/prithvi_geospatial_mae_io_processor.py
+4
-3
examples/offline_inference/rlhf_colocate.py
examples/offline_inference/rlhf_colocate.py
+77
-18
examples/offline_inference/rlhf_utils.py
examples/offline_inference/rlhf_utils.py
+75
-15
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
docs/usage/v1_guide.md
View file @
38d80967
...
...
@@ -83,7 +83,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
| Model Type | Status |
|-----------------------------|------------------------------------------------------------------------------------|
|
**Decoder-only Models**
|
<nobr>
🚀 Optimized
</nobr>
|
|
**Encoder-Decoder Models**
|
<nobr>
🟠 Delayed
</nobr>
|
|
**Encoder-Decoder Models**
|
<nobr>
🟢 Whisper only
</nobr>
|
|
**Embedding Models**
|
<nobr>
🟢 Functional
</nobr>
|
|
**Mamba Models**
|
<nobr>
🟢 (Mamba-2), 🟢 (Mamba-1)
</nobr>
|
|
**Multimodal Models**
|
<nobr>
🟢 Functional
</nobr>
|
...
...
@@ -110,7 +110,7 @@ Models using selective state-space mechanisms instead of standard transformer at
Models that use Mamba-2 and Mamba-1 layers (e.g.,
`Mamba2ForCausalLM`
,
`MambaForCausalLM`
,
`FalconMambaForCausalLM`
) are supported.
Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g.,
`BambaForCausalLM`
,
`Zamba2ForCausalLM`
,
`NemotronHForCausalLM`
,
`FalconH1ForCausalLM`
and
`GraniteMoeHybridForCausalLM`
,
`JambaForCausalLM`
).
`Zamba2ForCausalLM`
,
`NemotronHForCausalLM`
,
`FalconH1ForCausalLM`
and
`GraniteMoeHybridForCausalLM`
,
`JambaForCausalLM`
,
`Plamo2ForCausalLM`
).
Hybrid models with mechanisms different to Mamba are also supported (e.g.,
`MiniMaxText01ForCausalLM`
,
`MiniMaxM1ForCausalLM`
,
`Lfm2ForCausalLM`
).
...
...
@@ -118,8 +118,9 @@ Please note that prefix caching is not yet supported for any of the above models
#### Encoder-Decoder Models
Models requiring cross-attention between separate encoder and decoder (e.g.,
`BartForConditionalGeneration`
,
`MllamaForConditionalGeneration`
)
are not yet supported.
Whisper is supported. Other models requiring cross-attention between separate
encoder and decoder (e.g.,
`BartForConditionalGeneration`
,
`MllamaForConditionalGeneration`
) are not yet supported.
### Features
...
...
examples/offline_inference/audio_language.py
View file @
38d80967
...
...
@@ -117,7 +117,7 @@ def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:
# Granite Speech
def
run_granite_speech
(
question
:
str
,
audio_count
:
int
)
->
ModelRequestData
:
# NOTE - the setting in this example are somehat different
than
what is
# NOTE - the setting in this example are some
w
hat different
from
what is
# optimal for granite speech, and it is generally recommended to use beam
# search. Check the model README for suggested settings.
# https://huggingface.co/ibm-granite/granite-speech-3.3-8b
...
...
@@ -146,6 +146,36 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
)
# MiDashengLM
def run_midashenglm(question: str, audio_count: int) -> ModelRequestData:
    """Build the engine arguments and chat prompt for MiDashengLM.

    Args:
        question: text placed after the audio placeholders in the user turn
        audio_count: number of audio placeholders inserted into the prompt;
            also passed as the per-prompt "audio" limit

    Returns:
        `ModelRequestData` carrying the engine arguments and the prompt
    """
    model_name = "mispeech/midashenglm-7b"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"audio": audio_count},
    )

    # One begin/placeholder/end triple per audio clip.
    audio_in_prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>" * audio_count

    default_system = "You are a helpful language and speech assistant."

    prompt = (
        f"<|im_start|>system\n{default_system}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{audio_in_prompt}{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
    )
# MiniCPM-O
def
run_minicpmo
(
question
:
str
,
audio_count
:
int
)
->
ModelRequestData
:
model_name
=
"openbmb/MiniCPM-o-2_6"
...
...
@@ -352,6 +382,7 @@ model_example_map = {
"voxtral"
:
run_voxtral
,
"gemma3n"
:
run_gemma3n
,
"granite_speech"
:
run_granite_speech
,
"midashenglm"
:
run_midashenglm
,
"minicpmo"
:
run_minicpmo
,
"phi4_mm"
:
run_phi4mm
,
"phi4_multimodal"
:
run_phi4_multimodal
,
...
...
examples/offline_inference/chat_with_tools.py
View file @
38d80967
...
...
@@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools)
print
(
outputs
[
0
].
outputs
[
0
].
text
.
strip
())
# yields
# 'The weather in Dallas, TX is 85 degrees
f
ahrenheit. '
# 'The weather in Dallas, TX is 85 degrees
F
ahrenheit. '
# 'It is partly cloudly, with highs in the 90's.'
examples/offline_inference/data_parallel.py
View file @
38d80967
...
...
@@ -87,6 +87,11 @@ def parse_args():
default
=
0.8
,
help
=
(
"Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."
),
)
parser
.
add_argument
(
"--compilation-config"
,
type
=
int
,
help
=
(
"Compilation optimization (O) level 0-3."
),
)
parser
.
add_argument
(
"--quantization"
,
type
=
str
,
...
...
@@ -106,6 +111,7 @@ def main(
trust_remote_code
,
max_num_seqs
,
max_model_len
,
compilation_config
,
gpu_memory_utilization
,
quantization
,
):
...
...
@@ -162,6 +168,7 @@ def main(
max_model_len
=
max_model_len
,
gpu_memory_utilization
=
gpu_memory_utilization
,
quantization
=
quantization
,
compilation_config
=
compilation_config
,
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
...
...
@@ -218,6 +225,7 @@ if __name__ == "__main__":
args
.
trust_remote_code
,
args
.
max_num_seqs
,
args
.
max_model_len
,
args
.
compilation_config
,
args
.
gpu_memory_utilization
,
args
.
quantization
,
),
...
...
examples/offline_inference/disaggregated_prefill.py
View file @
38d80967
...
...
@@ -30,12 +30,12 @@ def run_prefill(prefill_done):
]
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
0.95
,
max_tokens
=
1
)
# Using P
y
NcclConnector to transmit KV caches between vLLM instances.
# Using P
2p
NcclConnector to transmit KV caches between vLLM instances.
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for P
y
NcclConnector.
# as required for P
2p
NcclConnector.
ktc
=
KVTransferConfig
(
kv_connector
=
"P
y
NcclConnector"
,
kv_connector
=
"P
2p
NcclConnector"
,
kv_role
=
"kv_producer"
,
kv_rank
=
0
,
kv_parallel_size
=
2
,
...
...
@@ -74,12 +74,12 @@ def run_decode(prefill_done):
]
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
0.95
)
# Using P
y
NcclConnector to transmit KV caches between vLLM instances.
# Using P
2p
NcclConnector to transmit KV caches between vLLM instances.
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for P
y
NcclConnector.
# as required for P
2p
NcclConnector.
ktc
=
KVTransferConfig
(
kv_connector
=
"P
y
NcclConnector"
,
kv_connector
=
"P
2p
NcclConnector"
,
kv_role
=
"kv_consumer"
,
kv_rank
=
1
,
kv_parallel_size
=
2
,
...
...
examples/offline_inference/encoder_decoder.py
View file @
38d80967
...
...
@@ -5,6 +5,8 @@ Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART and mBART.
This script is refactored to allow model selection via command-line arguments.
NOTE: This example is not yet supported in V1.
"""
import
argparse
...
...
examples/offline_inference/encoder_decoder_multimodal.py
View file @
38d80967
...
...
@@ -5,6 +5,7 @@ This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import
os
import
time
from
collections.abc
import
Sequence
from
dataclasses
import
asdict
...
...
@@ -130,6 +131,8 @@ def run_mllama():
def
run_whisper
():
os
.
environ
[
"VLLM_WORKER_MULTIPROC_METHOD"
]
=
"spawn"
engine_args
=
EngineArgs
(
model
=
"openai/whisper-large-v3-turbo"
,
max_model_len
=
448
,
...
...
examples/offline_inference/logits_processor.py
→
examples/offline_inference/logits_processor
/custom
.py
View file @
38d80967
File moved
examples/offline_inference/logits_processor/custom_req.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""This example demonstrates wrapping a request-level logits processor to be
compatible with vLLM's batch-level logits processing.
For demo purposes, a dummy logits processor is employed which, if
`target_token` is passed as a keyword argument to `SamplingParams.extra_args`,
will mask out all tokens except `target_token`. This logits processor can be
applied to a vector of logits associated with a single decode step for a single
request. The logits processor cannot be applied to a request which does not
pass in a `target_token` custom argument.
The request-level dummy logits processor is wrapped to create a batch-level
logits processor, which can apply the logits processor to output logits from
all requests in the persistent batch in a given decode step. For requests which
do not provide a `target_token` argument, the corresponding row of `logits`
will not be modified.
A batch is constructed with `temperature=0.0` and 50% of requests specifying
`target_token`, and for these requests - and *only* these requests - we
expect the `target_token` to be decoded in each step, yielding an output
similar to that shown below:
Generated Outputs:
------------------------------------------------------------
Prompt: 'Hello, my name is'
Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
------------------------------------------------------------
Prompt: 'The president of the United States is'
Output: " not a racist. He is a racist.
\n
He's a racist because he"
------------------------------------------------------------
Prompt: 'The capital of France is'
Output: ' also also also also also also also also also also also also also
also also also'
------------------------------------------------------------
Prompt: 'The future of AI is'
Output: ' in the hands of the people.
\n\n
The future of AI is in the'
------------------------------------------------------------
"""
from
typing
import
Any
,
Optional
import
torch
from
vllm
import
LLM
,
SamplingParams
from
vllm.logger
import
init_logger
from
vllm.v1.sample.logits_processor
import
(
AdapterLogitsProcessor
,
RequestLogitsProcessor
,
)
logger
=
init_logger
(
__name__
)
class DummyPerReqLogitsProcessor:
    """Request-level logits processor that leaves a single token selectable.

    Every logit except the one at index `target_token` is overwritten with
    negative infinity.
    """

    def __init__(self, target_token: int) -> None:
        """Remember the id of the one token whose logit is preserved."""
        self.target_token = target_token

    def __call__(
        self,
        output_ids: list[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Mask `logits` in place and hand the same tensor back.

        Args:
            output_ids: part of the call signature; not read here
            logits: logits vector indexed by token id; modified in place

        Returns:
            The `logits` tensor that was passed in, now -inf everywhere
            except at `target_token`
        """
        keep_idx = self.target_token
        # Pull the surviving value out as a Python scalar before wiping.
        kept_value = logits[keep_idx].item()
        logits.fill_(float("-inf"))
        logits[keep_idx] = kept_value
        return logits
class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
    """Example of wrapping a fake request-level logit processor to create a
    batch-level logits processor"""

    def is_argmax_invariant(self) -> bool:
        # The wrapped processor masks every token but one, which can change
        # which token holds the largest logit.
        return False

    def new_req_logits_processor(
        self,
        params: SamplingParams,
    ) -> Optional[RequestLogitsProcessor]:
        """This method returns a new request-level logits processor, customized
        to the `target_token` value associated with a particular request.

        Returns None if the logits processor should not be applied to the
        particular request. To use the logits processor the request must have
        a "target_token" custom argument with an integer value.

        Args:
            params: per-request sampling params

        Returns:
            `Callable` request logits processor, or None
        """
        extra_args = params.extra_args
        # Treat a missing and an empty `extra_args` the same way. The
        # `extra_args and extra_args.get(...)` idiom evaluates to the empty
        # dict itself when `extra_args == {}`, which would fall through to
        # the "not int" warning below even though no target was requested.
        target_token: Optional[Any] = (
            extra_args.get("target_token") if extra_args else None
        )
        if target_token is None:
            return None
        if not isinstance(target_token, int):
            logger.warning(
                "target_token value %s is not int; not applying logits"
                " processor to request.",
                target_token,
            )
            return None
        return DummyPerReqLogitsProcessor(target_token)
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a mixture of requests which do and don't utilize the dummy logitproc
sampling_params_list
=
[
SamplingParams
(
temperature
=
0.0
,
extra_args
=
{
"target_token"
:
128
}),
SamplingParams
(
temperature
=
0.0
),
SamplingParams
(
temperature
=
0.0
,
extra_args
=
{
"target_token"
:
67
}),
SamplingParams
(
temperature
=
0.0
),
]
def main():
    """Generate completions for the sample prompts with the wrapped logits
    processor installed, then print every prompt next to its completion."""
    # Create an LLM.
    engine = LLM(
        model="facebook/opt-125m",
        logits_processors=[WrappedPerReqLogitsProcessor],
    )
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    results = engine.generate(prompts, sampling_params_list)
    # Print the outputs.
    divider = "-" * 60
    print("\nGenerated Outputs:\n" + divider)
    for result in results:
        shown_prompt, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}")
        print(f"Output: {completion!r}")
        print(divider)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/logits_processor/custom_req_init.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""This example demonstrates a special case of wrapping a request-level logits
processor, namely the case where it is necessary to utilize engine config or
environment info passed to the constructor. The subclass must override the
wrapper base class `__init__()` method to access the engine config, the device
identifier, or the flag which indicates whether pinned memory is available.
For demo purposes, a request-level dummy logits processor is employed which
causes the same token (`target_token`) to be decoded in each step. The
request-level dummy logits processor is wrapped to create a batch-level logits
processor, which can apply the logits processor to output logits from all
requests in the persistent batch in a given decode step.
The wrapped dummy logits processor below models a scenario where we must
disable the logits processor on non-"cuda" platforms. The wrapper base class
`__init__()` is overridden in order to check this condition and set a flag.
A batch is constructed with `temperature=0.0` and 50% of requests specifying
`target_token`, and for these requests - and *only* these requests - we
expect that on a "cuda" device the output will look something like:
Generated Outputs:
------------------------------------------------------------
Prompt: 'Hello, my name is'
Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
------------------------------------------------------------
Prompt: 'The president of the United States is'
Output: " not a racist. He is a racist.
\n
He's a racist because he"
------------------------------------------------------------
Prompt: 'The capital of France is'
Output: ' also also also also also also also also also also also also also
also also also'
------------------------------------------------------------
Prompt: 'The future of AI is'
Output: ' in the hands of the people.
\n\n
The future of AI is in the'
------------------------------------------------------------
which indicates that the logits processor is running. However, on a non-"cuda"
device, the first and third requests would not repeat the same token.
"""
from
typing
import
Optional
import
torch
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.v1.sample.logits_processor
import
(
AdapterLogitsProcessor
,
RequestLogitsProcessor
,
)
logger
=
init_logger
(
__name__
)
class DummyPerReqLogitsProcessor:
    """Request-level logits processor that leaves a single token selectable.

    Every logit except the one at index `target_token` is overwritten with
    negative infinity.
    """

    def __init__(self, target_token: int) -> None:
        """Remember the id of the one token whose logit is preserved."""
        self.target_token = target_token

    def __call__(
        self,
        output_ids: list[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Mask `logits` in place and hand the same tensor back.

        Args:
            output_ids: part of the call signature; not read here
            logits: logits vector indexed by token id; modified in place

        Returns:
            The `logits` tensor that was passed in, now -inf everywhere
            except at `target_token`
        """
        keep_idx = self.target_token
        # Pull the surviving value out as a Python scalar before wiping.
        kept_value = logits[keep_idx].item()
        logits.fill_(float("-inf"))
        logits[keep_idx] = kept_value
        return logits
class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
    """Example of overriding the wrapper class `__init__()` in order to utilize
    info about the device type"""

    def __init__(
        self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
    ):
        """Forward all arguments to the base class and record whether the
        device is "cuda"-type."""
        super().__init__(vllm_config, device, is_pin_memory)
        self.is_cuda = device.type == "cuda"

    def is_argmax_invariant(self) -> bool:
        # The wrapped processor masks every token but one, which can change
        # which token holds the largest logit.
        return False

    def new_req_logits_processor(
        self,
        params: SamplingParams,
    ) -> Optional[RequestLogitsProcessor]:
        """This method returns a new request-level logits processor, customized
        to the `target_token` value associated with a particular request.

        Returns None if the logits processor should not be applied to the
        particular request. To use the logits processor the request must have
        a "target_token" custom argument with an integer value, and the device
        must be "cuda"-type.

        Args:
            params: per-request sampling params

        Returns:
            `Callable` request logits processor, or None
        """
        if not self.is_cuda:
            return None
        extra_args = params.extra_args
        # Treat a missing and an empty `extra_args` the same way. The
        # `extra_args and extra_args.get(...)` idiom evaluates to the empty
        # dict itself when `extra_args == {}`, which would fall through to
        # the "not int" warning below even though no target was requested.
        target_token = extra_args.get("target_token") if extra_args else None
        if target_token is None:
            return None
        if not isinstance(target_token, int):
            logger.warning(
                "target_token value %s is not int; not applying logits"
                " processor to request.",
                target_token,
            )
            return None
        return DummyPerReqLogitsProcessor(target_token)
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a mixture of requests which do and don't utilize the dummy logitproc
sampling_params_list
=
[
SamplingParams
(
temperature
=
0.0
,
extra_args
=
{
"target_token"
:
128
}),
SamplingParams
(
temperature
=
0.0
),
SamplingParams
(
temperature
=
0.0
,
extra_args
=
{
"target_token"
:
67
}),
SamplingParams
(
temperature
=
0.0
),
]
def main():
    """Generate completions for the sample prompts with the wrapped logits
    processor installed, then print every prompt next to its completion."""
    # Create an LLM.
    engine = LLM(
        model="facebook/opt-125m",
        logits_processors=[WrappedPerReqLogitsProcessor],
    )
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    results = engine.generate(prompts, sampling_params_list)
    # Print the outputs.
    divider = "-" * 60
    print("\nGenerated Outputs:\n" + divider)
    for result in results:
        shown_prompt, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}")
        print(f"Output: {completion!r}")
        print(divider)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/multilora_inference.py
View file @
38d80967
...
...
@@ -23,7 +23,7 @@ def create_test_prompts(
2 requests for base model, 4 requests for the LoRA. We define 2
different LoRA adapters (using the same model for demo purposes).
Since we also set `max_loras=1`, the expectation is that the requests
with the second LoRA adapter will be r
a
n after all requests with the
with the second LoRA adapter will be r
u
n after all requests with the
first adapter have finished.
"""
return
[
...
...
examples/offline_inference/neuron.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
def main():
    """Create a TinyLlama engine with `device="neuron"`, generate a completion
    for each module-level prompt and print the results."""
    # Create an LLM.
    engine = LLM(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        max_num_seqs=8,
        # The max_model_len and block_size arguments are required to be same as
        # max sequence length when targeting neuron device.
        # Currently, this is a known limitation in continuous batching support
        # in transformers-neuronx.
        # TODO(liangfu): Support paged-attention in transformers-neuronx.
        max_model_len=1024,
        block_size=1024,
        # ruff: noqa: E501
        # The device can be automatically detected when AWS Neuron SDK is installed.
        # The device argument can be either unspecified for automated detection,
        # or explicitly assigned.
        device="neuron",
        tensor_parallel_size=2,
    )
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    results = engine.generate(prompts, sampling_params)
    # Print the outputs.
    divider = "-" * 50
    print(divider)
    for result in results:
        shown_prompt, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}\nGenerated text: {completion!r}")
        print(divider)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/neuron_eagle.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to run offline inference with an EAGLE speculative
decoding model on neuron. To use EAGLE speculative decoding, you must use
a draft model that is specifically fine-tuned for EAGLE speculation.
Additionally, to use EAGLE with NxD Inference, the draft model must include
the LM head weights from the target model. These weights are shared between
the draft and target model.
"""
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
prompts
=
[
"What is annapurna labs?"
,
]
def main():
    """Run the module-level prompts through a Llama 3.1 70B engine configured
    for EAGLE speculation on the "neuron" device and print the results."""
    # Create a sampling params object.
    sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)

    # Draft-model settings handed to the engine as `speculative_config`.
    draft_settings = {
        "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
        "num_speculative_tokens": 5,
        "max_model_len": 2048,
    }
    # Create an LLM.
    engine = LLM(
        model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
        speculative_config=draft_settings,
        max_num_seqs=4,
        # The max_model_len and block_size arguments are required to be same as
        # max sequence length when targeting neuron device.
        # Currently, this is a known limitation in continuous batching support
        # in neuronx-distributed-inference.
        max_model_len=2048,
        block_size=2048,
        # The device can be automatically detected when AWS Neuron SDK is installed.
        # The device argument can be either unspecified for automated detection,
        # or explicitly assigned.
        device="neuron",
        tensor_parallel_size=32,
        override_neuron_config={
            "enable_eagle_speculation": True,
            "enable_fused_speculation": True,
        },
    )

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    results = engine.generate(prompts, sampling_params)
    # Print the outputs.
    for result in results:
        shown_prompt, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}, \n\n\n Generated text: {completion!r}")
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/neuron_int8_quantization.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
vllm
import
LLM
,
SamplingParams
# creates XLA hlo graphs for all the context length buckets.
os
.
environ
[
"NEURON_CONTEXT_LENGTH_BUCKETS"
]
=
"128,512,1024,2048"
# creates XLA hlo graphs for all the token gen buckets.
os
.
environ
[
"NEURON_TOKEN_GEN_BUCKETS"
]
=
"128,512,1024,2048"
# Quantizes neuron model weight to int8 ,
# The default config for quantization is int8 dtype.
os
.
environ
[
"NEURON_QUANT_DTYPE"
]
=
"s8"
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
def main():
    """Create a TinyLlama engine with `quantization="neuron_quant"` on the
    "neuron" device, generate a completion for each module-level prompt and
    print the results."""
    # Create an LLM.
    engine = LLM(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        max_num_seqs=8,
        # The max_model_len and block_size arguments are required to be same as
        # max sequence length when targeting neuron device.
        # Currently, this is a known limitation in continuous batching support
        # in transformers-neuronx.
        # TODO(liangfu): Support paged-attention in transformers-neuronx.
        max_model_len=2048,
        block_size=2048,
        # ruff: noqa: E501
        # The device can be automatically detected when AWS Neuron SDK is installed.
        # The device argument can be either unspecified for automated detection,
        # or explicitly assigned.
        device="neuron",
        quantization="neuron_quant",
        override_neuron_config={
            "cast_logits_dtype": "bfloat16",
        },
        tensor_parallel_size=2,
    )
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    results = engine.generate(prompts, sampling_params)
    # Print the outputs.
    divider = "-" * 50
    print(divider)
    for result in results:
        shown_prompt, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}\nGenerated text: {completion!r}")
        print(divider)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/neuron_multimodal.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
requests
import
torch
from
neuronx_distributed_inference.models.mllama.utils
import
add_instruct
from
PIL
import
Image
from
vllm
import
LLM
,
SamplingParams
,
TextPrompt
def get_image(image_url):
    """Fetch `image_url` with a streaming GET request and open the raw
    response stream as a PIL image."""
    response = requests.get(image_url, stream=True)
    return Image.open(response.raw)
# Model Inputs
PROMPTS
=
[
"What is in this image? Tell me a story"
,
"What is the recipe of mayonnaise in two sentences?"
,
"Describe this image"
,
"What is the capital of Italy famous for?"
,
]
IMAGES
=
[
get_image
(
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
),
None
,
get_image
(
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
),
None
,
]
SAMPLING_PARAMS
=
[
dict
(
top_k
=
1
,
temperature
=
1.0
,
top_p
=
1.0
,
max_tokens
=
16
)
for
_
in
range
(
len
(
PROMPTS
))
]
def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
    """Prepare all inputs for one mllama generation request.

    This covers three steps:
      1. put the text prompt into the instruct chat template
      2. compose the text and the (optional) image into a `TextPrompt`
      3. build a `SamplingParams` object from the keyword dict

    Args:
        prompt: raw text prompt
        single_image: image for this prompt, or None for a text-only prompt
        sampling_params: mapping expanded as keyword arguments to
            `SamplingParams`

    Returns:
        Tuple of (`TextPrompt` inputs, `SamplingParams`)
    """
    # Only an empty tensor switches the flag to 0.
    # NOTE(review): a `None` image still leaves the flag at 1 - confirm this
    # is what `add_instruct` expects for text-only prompts.
    is_empty_tensor = (
        isinstance(single_image, torch.Tensor) and single_image.numel() == 0
    )
    has_image = torch.tensor([0]) if is_empty_tensor else torch.tensor([1])

    request = TextPrompt(prompt=add_instruct(prompt, has_image))
    if single_image is not None:
        request["multi_modal_data"] = {"image": single_image}

    return request, SamplingParams(**sampling_params)
def print_outputs(outputs):
    """Print one line per entry of `outputs`: its prompt and the text of its
    first completion, both shown with `repr()` formatting."""
    # Print the outputs.
    for result in outputs:
        shown_prompt, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}, Generated text: {completion!r}")
def main():
    """Run every prompt/image pair through the mllama engine one at a time,
    then run all of them again as a single batch, printing the outputs of
    each call."""
    assert len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS), (
        f"""Text, image prompts and sampling parameters should have the
        same batch size; but got {len(PROMPTS)}, {len(IMAGES)},
        and {len(SAMPLING_PARAMS)}"""
    )

    # Create an LLM.
    engine = LLM(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_num_seqs=1,
        max_model_len=4096,
        block_size=4096,
        device="neuron",
        tensor_parallel_size=32,
        override_neuron_config={
            "sequence_parallel_enabled": False,
            "skip_warmup": True,
            "save_sharded_checkpoint": True,
            "on_device_sampling_config": {
                "global_topk": 1,
                "dynamic": False,
                "deterministic": False,
            },
        },
    )

    all_inputs = []
    all_sampling_params = []
    for text, image, raw_params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
        request, request_params = get_VLLM_mllama_model_inputs(
            text, image, raw_params
        )
        # test batch-size = 1
        print_outputs(engine.generate(request, request_params))
        all_inputs.append(request)
        all_sampling_params.append(request_params)

    # test batch-size = 4
    print_outputs(engine.generate(all_inputs, all_sampling_params))
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/neuron_speculation.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to run offline inference with a speculative
decoding model on neuron.
"""
import
os
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
prompts
=
[
"Hello, I am a language model and I can help"
,
"The president of the United States is"
,
"The capital of France is"
,
]
def config_buckets():
    """Set the context-length and token-generation bucket environment
    variables to the same comma-separated list of sizes."""
    bucket_sizes = "128,512,1024,2048"
    # The first variable creates XLA hlo graphs for all the context length
    # buckets, the second for all the token gen buckets.
    for env_name in (
        "NEURON_CONTEXT_LENGTH_BUCKETS",
        "NEURON_TOKEN_GEN_BUCKETS",
    ):
        os.environ[env_name] = bucket_sizes
def initialize_llm():
    """Return an `LLM` for open_llama_7b on the "neuron" device, using
    open_llama_3b as the speculative draft model."""
    # Draft-model settings handed to the engine as `speculative_config`.
    draft_settings = {
        "model": "openlm-research/open_llama_3b",
        "num_speculative_tokens": 4,
        "max_model_len": 2048,
    }
    return LLM(
        model="openlm-research/open_llama_7b",
        speculative_config=draft_settings,
        max_num_seqs=4,
        max_model_len=2048,
        block_size=2048,
        device="neuron",
        tensor_parallel_size=32,
    )
def process_requests(llm: LLM, sampling_params: SamplingParams):
    """Generate completions for the module-level `prompts` and print each
    prompt together with the text of its first completion."""
    for result in llm.generate(prompts, sampling_params):
        shown_prompt, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}, Generated text: {completion!r}")
def main():
    """Configure the buckets, build the engine and process the prompts."""
    config_buckets()
    engine = initialize_llm()
    # Create a sampling params object.
    request_params = SamplingParams(max_tokens=100, top_k=1)
    process_requests(engine, request_params)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/prithvi_geospatial_mae.py
View file @
38d80967
...
...
@@ -45,7 +45,11 @@ datamodule_config = {
class
PrithviMAE
:
def
__init__
(
self
,
model
):
self
.
model
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
,
dtype
=
"float16"
,
enforce_eager
=
True
model
=
model
,
skip_tokenizer_init
=
True
,
dtype
=
"float16"
,
enforce_eager
=
True
,
model_impl
=
"terratorch"
,
)
def
run
(
self
,
input_data
,
location_coords
):
...
...
examples/offline_inference/prithvi_geospatial_mae_io_processor.py
View file @
38d80967
...
...
@@ -12,13 +12,13 @@ from vllm.pooling_params import PoolingParams
# multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and
# perform inference.
# Reuirement - install plugin at:
# Re
q
uirement - install plugin at:
# https://github.com/christian-pinto/prithvi_io_processor_plugin
def
main
():
torch
.
set_default_dtype
(
torch
.
float16
)
image_url
=
"https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/
India_900498_S2Hand
.tif"
# noqa: E501
image_url
=
"https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/
valencia_example_2024-10-26
.tif
f
"
# noqa: E501
img_prompt
=
dict
(
data
=
image_url
,
...
...
@@ -36,7 +36,8 @@ def main():
# to avoid the model going OOM.
# The maximum number depends on the available GPU memory
max_num_seqs
=
32
,
io_processor_plugin
=
"prithvi_to_tiff_india"
,
io_processor_plugin
=
"prithvi_to_tiff"
,
model_impl
=
"terratorch"
,
)
pooling_params
=
PoolingParams
(
task
=
"encode"
,
softmax
=
False
)
...
...
examples/offline_inference/rlhf_colocate.py
View file @
38d80967
...
...
@@ -28,12 +28,15 @@ Learn more about Ray placement groups:
https://docs.ray.io/en/latest/placement-groups.html
"""
import
gc
import
os
import
ray
import
torch
import
zmq
from
ray.util.placement_group
import
placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
torch.multiprocessing.reductions
import
reduce_tensor
from
vllm
import
LLM
...
...
@@ -86,20 +89,72 @@ class RayTrainingActor:
from
vllm.platforms
import
current_platform
self
.
device_uuid
=
current_platform
.
get_device_uuid
(
0
)
self
.
zmq_context
=
zmq
.
Context
()
self
.
zmq_address_counter
=
0
self
.
zmq_handle
=
None
def report_device_id(self) -> str:
    """Return the device UUID stored on this actor as ``self.device_uuid``."""
    device_uuid = self.device_uuid
    return device_uuid
def
get_weight_ipc_handles
(
self
):
from
torch.multiprocessing.reductions
import
reduce_tensor
def get_zmq_handles(self) -> dict[str, str]:
    """Allocate a fresh ZMQ IPC endpoint for this actor and return it.

    The endpoint name combines the device UUID with a per-actor counter, so
    every call yields a new address. The address is remembered in
    ``self.zmq_handle`` and the counter is advanced by one.

    Returns:
        A single-entry mapping from this actor's device UUID to the endpoint.
    """
    uuid = self.device_uuid
    suffix = f"{uuid}-{self.zmq_address_counter}"
    endpoint = f"ipc:///tmp/rl-colocate-zmq-{suffix}.sock"
    self.zmq_handle = endpoint
    self.zmq_address_counter += 1
    return {uuid: endpoint}
data
=
{}
for
name
,
p
in
self
.
model
.
named_parameters
():
# A training actor might hold only a subset of the weights and may
# need to gather weights from other actors. For demonstration
# purposes, each training actor owns the full weight set.
data
[
name
]
=
reduce_tensor
(
p
.
detach
())
return
{
self
.
device_uuid
:
data
}
def update_weights(self):
    """Stream this actor's model parameters to the peer over ZMQ + CUDA IPC.

    A uint8 staging buffer is allocated on ``cuda:0`` and its IPC handle
    (from ``reduce_tensor``) is sent over a REQ socket bound to
    ``self.zmq_handle``. Parameters are grouped into buckets that fit in the
    buffer; each bucket is copied into the buffer, its metadata list is sent,
    and the reply is awaited before the buffer is reused. A final ``None``
    message marks the end of the update.

    The socket and the staging buffer are released in a ``finally`` clause so
    they are not leaked when the transfer fails part-way.
    """
    # align size to avoid misaligned address
    align_size = 256

    def get_size(p: torch.Tensor) -> int:
        # Byte size of ``p`` rounded up to the next multiple of align_size.
        return (p.nbytes + align_size - 1) // align_size * align_size

    named_parameters: dict[str, torch.nn.Parameter] = dict(
        self.model.named_parameters()
    )
    max_tensor_size = max(get_size(p) for p in named_parameters.values())
    # use max_tensor_size * 2 as buffer size
    buffer = torch.empty(max_tensor_size * 2, dtype=torch.uint8, device="cuda:0")
    s = self.zmq_context.socket(zmq.REQ)
    try:
        s.bind(self.zmq_handle)
        handle = reduce_tensor(buffer)

        # Plan the buckets first: each bucket is (metadata list, tensors),
        # with offsets restarting at 0 whenever the buffer would overflow.
        offset = 0
        buckets: list[tuple[list[dict], list[torch.Tensor]]] = []
        named_tensors: list[dict] = []
        real_tensors: list[torch.Tensor] = []
        for name, p in named_parameters.items():
            size = get_size(p)
            if offset + size > buffer.numel():
                buckets.append((named_tensors, real_tensors))
                named_tensors, real_tensors = [], []
                offset = 0
            # assume tensors are contiguous
            named_tensors.append(
                {
                    "name": name,
                    "dtype": p.dtype,
                    "shape": p.shape,
                    "offset": offset,
                }
            )
            real_tensors.append(p)
            offset += size
        if named_tensors:
            buckets.append((named_tensors, real_tensors))

        # Hand the buffer's IPC handle to the peer and wait for its reply.
        s.send_pyobj(handle)
        s.recv()
        for named_tensors, real_tensors in buckets:
            offset = 0
            for p in real_tensors:
                buffer[offset : offset + p.nbytes].data.copy_(
                    p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
                )
                offset += get_size(p)
            # The copies are non-blocking; synchronize before announcing them.
            torch.cuda.synchronize()
            s.send_pyobj(named_tensors)
            # Wait for the reply before the buffer is overwritten again.
            s.recv()
        # ``None`` tells the peer that no further buckets follow.
        s.send_pyobj(None)
        s.recv()
    finally:
        # Release the socket and GPU memory on both success and failure.
        s.close()
        del buffer
        gc.collect()
        torch.cuda.empty_cache()
# Ray manages four GPUs.
...
...
@@ -175,18 +230,22 @@ assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# the second inference engine.
assert
training_actor_device_ids
[
2
:]
==
inference_engine_device_ids
[
1
]
print
(
"Gather all the
IPC
handles from the training actors."
)
ipc
_handles
=
{}
print
(
"Gather all the
ZMQ
handles from the training actors."
)
zmq
_handles
=
{}
for
actor
in
training_actors
:
ipc_handles
.
update
(
ray
.
get
(
actor
.
get_weight_ipc_handles
.
remote
()))
zmq_handles
.
update
(
ray
.
get
(
actor
.
get_zmq_handles
.
remote
()))
print
(
f
"ZMQ handles:
{
zmq_handles
}
"
)
print
(
"Update the weights of the inference engines."
)
for
llm
in
inference_engines
:
ray
.
get
(
llm
.
collective_rpc
.
remote
(
"update_weights_from_ipc_handles"
,
args
=
(
ipc_handles
,)
)
)
ray
.
get
(
[
actor
.
update_weights
.
remote
()
for
actor
in
training_actors
]
+
[
llm
.
collective_rpc
.
remote
(
"update_weights_from_ipc"
,
args
=
(
zmq_handles
,))
for
llm
in
inference_engines
]
)
print
(
"Check if the weights are updated."
)
for
llm
in
inference_engines
:
assert
ray
.
get
(
llm
.
collective_rpc
.
remote
(
"check_weights_changed"
,
args
=
tuple
()))
examples/offline_inference/rlhf_utils.py
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
gc
from
typing
import
Callable
,
Optional
,
TypedDict
import
torch
import
zmq
def
stateless_init_process_group
(
master_address
,
master_port
,
rank
,
world_size
,
device
):
...
...
@@ -66,6 +70,27 @@ class WorkerExtension:
return
weights_updated
def rebuild_ipc(
    handle: tuple[Callable, tuple], device_id: Optional[int] = None
) -> torch.Tensor:
    """Rebuild an object from a ``(func, args)`` handle by calling ``func(*args)``.

    Args:
        handle: Pair of a rebuild callable and its positional arguments.
        device_id: When not ``None``, replaces the argument at index 6 before
            the call. NOTE(review): index 6 is presumably the device slot of
            the CUDA rebuild arguments; confirm against the torch version in
            use.

    Returns:
        Whatever the rebuild callable returns.
    """
    rebuild_fn, rebuild_args = handle
    call_args = [*rebuild_args]
    if device_id is None:
        return rebuild_fn(*call_args)
    # the key is to change device id to the current device id
    # in case two processes have different CUDA_VISIBLE_DEVICES
    call_args[6] = device_id
    return rebuild_fn(*call_args)
# Typed dictionary describing one tensor stored inside a shared flat buffer.
class
FlattenedTensorMetadata
(
TypedDict
):
# name paired with the tensor
name
:
str
# shape of the tensor
shape
:
torch
.
Size
# element dtype of the tensor
dtype
:
torch
.
dtype
# specify the start offset of this tensor in shared ipc_buffer tensor
offset
:
int
class
ColocateWorkerExtension
:
"""
The class for vLLM's worker to inherit from, in the colocate setting.
...
...
@@ -76,27 +101,62 @@ class ColocateWorkerExtension:
should pass the full qualified name as `worker_extension_cls` argument.
"""
def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
    """Receive weights over a ZMQ REP socket and load them into the model.

    The socket connects to the endpoint registered for this worker's device
    UUID and then serves three kinds of messages, acknowledging each with an
    empty reply:

    * a ``tuple``: an IPC handle, rebuilt via ``rebuild_ipc`` into the shared
      uint8 buffer;
    * a ``list`` of tensor metadata: each entry is viewed out of the shared
      buffer and the batch is passed to ``model.load_weights``;
    * ``None``: end of update; ``process_weights_after_loading`` is run and
      the loop stops.

    The socket and the shared buffer are released in a ``finally`` clause so
    they are not leaked if receiving or loading raises.

    Args:
        zmq_handles: Mapping from device UUID to ZMQ endpoint address.
    """
    from vllm.model_executor.model_loader.utils import (
        process_weights_after_loading,
    )

    assert self.device is not None
    # Create the ZMQ context on first use and keep it on the worker.
    if not hasattr(self, "_zmq_ctx") or self._zmq_ctx is None:
        self._zmq_ctx = zmq.Context()
    socket = self._zmq_ctx.socket(zmq.REP)
    buffer: Optional[torch.Tensor] = None
    try:
        socket.connect(zmq_handles[self.report_device_id()])
        while True:
            # NOTE: recv_pyobj unpickles the message, so this endpoint must
            # only ever be reachable by the trusted sending process.
            payload: (
                tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None
            ) = socket.recv_pyobj()
            if payload is None:
                # means the update is done
                process_weights_after_loading(
                    self.model_runner.model, self.model_config, self.device
                )
                torch.cuda.synchronize()
                socket.send(b"")
                break
            if isinstance(payload, tuple):
                # an ipc handle that vLLM can use `func, args = handle`
                # and `func(*args)` to rebuild GPU tensor.
                buffer = rebuild_ipc(payload, self.device.index)
                assert buffer.dtype == torch.uint8
                socket.send(b"")
                continue
            assert isinstance(payload, list)
            assert buffer is not None
            weights = []
            for item in payload:
                shape = item["shape"]
                if isinstance(shape, (list, tuple)):
                    shape = torch.Size(shape)
                assert isinstance(shape, torch.Size)
                dtype, offset = item["dtype"], item["offset"]
                # Byte length of this tensor inside the uint8 buffer.
                size = dtype.itemsize * shape.numel()
                tensor = buffer[offset : offset + size].view(dtype=dtype).view(shape)
                weights.append((item["name"], tensor))
            self.model_runner.model.load_weights(weights=weights)
            del weights
            # Finish loading before the reply lets the sender reuse the buffer.
            torch.cuda.synchronize()
            socket.send(b"")
    finally:
        # Release the socket and the shared buffer on success and on failure.
        socket.close()
        del buffer
        gc.collect()
        torch.cuda.empty_cache()
def report_device_id(self) -> str:
    """Look up, cache and return the UUID of this worker's device.

    The UUID is queried from ``current_platform`` using ``self.device.index``
    and stored on the instance as ``self.device_uuid``.
    """
    from vllm.platforms import current_platform

    device_index = self.device.index
    self.device_uuid = current_platform.get_device_uuid(device_index)
    return self.device_uuid
def update_weights_from_ipc_handles(self, ipc_handles):
    """Rebuild tensors from per-parameter IPC handles and load them.

    Args:
        ipc_handles: Mapping keyed by device UUID; the entry for
            ``self.device_uuid`` maps parameter names to ``(func, args)``
            handles.
    """
    handles_for_device = ipc_handles[self.device_uuid]
    local_device = self.device.index
    weights = []
    for name, (rebuild_fn, rebuild_args) in handles_for_device.items():
        patched_args = list(rebuild_args)
        # the key is to change device id to the current device id
        # in case two processes have different CUDA_VISIBLE_DEVICES
        patched_args[6] = local_device
        weights.append((name, rebuild_fn(*patched_args)))
    self.model_runner.model.load_weights(weights=weights)
    torch.cuda.synchronize()
def
check_weights_changed
(
self
):
"""
Check if the weights are updated to 0.
...
...
Prev
1
…
3
4
5
6
7
8
9
10
11
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment