Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
711aa9d5
Commit
711aa9d5
authored
Jul 30, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.0' into v0.10.0-dev
parents
751c492c
6d8d0a24
Changes
519
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
683 additions
and
290 deletions
+683
-290
examples/offline_inference/basic/embed.py
examples/offline_inference/basic/embed.py
+2
-2
examples/offline_inference/basic/score.py
examples/offline_inference/basic/score.py
+2
-2
examples/offline_inference/batch_llm_inference.py
examples/offline_inference/batch_llm_inference.py
+11
-9
examples/offline_inference/convert_model_to_seq_cls.py
examples/offline_inference/convert_model_to_seq_cls.py
+134
-0
examples/offline_inference/embed_jina_embeddings_v3.py
examples/offline_inference/embed_jina_embeddings_v3.py
+2
-2
examples/offline_inference/embed_matryoshka_fy.py
examples/offline_inference/embed_matryoshka_fy.py
+2
-2
examples/offline_inference/neuron_eagle.py
examples/offline_inference/neuron_eagle.py
+1
-1
examples/offline_inference/neuron_speculation.py
examples/offline_inference/neuron_speculation.py
+6
-7
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/prithvi_geospatial_mae.py
+68
-173
examples/offline_inference/qwen3_reranker.py
examples/offline_inference/qwen3_reranker.py
+4
-4
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf.py
+49
-36
examples/offline_inference/rlhf_colocate.py
examples/offline_inference/rlhf_colocate.py
+74
-41
examples/offline_inference/skip_loading_weights_in_engine_init.py
.../offline_inference/skip_loading_weights_in_engine_init.py
+53
-0
examples/offline_inference/spec_decode.py
examples/offline_inference/spec_decode.py
+1
-0
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+39
-0
examples/offline_inference/vision_language_pooling.py
examples/offline_inference/vision_language_pooling.py
+89
-7
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
...ted_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+1
-0
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
...gated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
+35
-4
examples/online_serving/elastic_ep/bench.sh
examples/online_serving/elastic_ep/bench.sh
+57
-0
examples/online_serving/elastic_ep/scale.py
examples/online_serving/elastic_ep/scale.py
+53
-0
No files found.
Too many changes to show.
To preserve performance only
519 of 519+
files are displayed.
Plain diff
Email patch
examples/offline_inference/basic/embed.py
View file @
711aa9d5
...
...
@@ -31,10 +31,10 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="embed" for embedding models
model
=
LLM
(
**
vars
(
args
))
llm
=
LLM
(
**
vars
(
args
))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs
=
model
.
embed
(
prompts
)
outputs
=
llm
.
embed
(
prompts
)
# Print the outputs.
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
...
...
examples/offline_inference/basic/score.py
View file @
711aa9d5
...
...
@@ -27,10 +27,10 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="score" for cross-encoder models
model
=
LLM
(
**
vars
(
args
))
llm
=
LLM
(
**
vars
(
args
))
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs
=
model
.
score
(
text_1
,
texts_2
)
outputs
=
llm
.
score
(
text_1
,
texts_2
)
# Print the outputs.
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
...
...
examples/offline_inference/batch_llm_inference.py
View file @
711aa9d5
...
...
@@ -3,17 +3,19 @@
"""
This example shows how to use Ray Data for data parallel batch inference.
Ray Data is a data processing framework that can handle large datasets
and integrates tightly with vLLM for data-parallel inference.
As of Ray 2.44, Ray Data has a native integration with
vLLM (under ray.data.llm).
Ray Data is a data processing framework that can process very large datasets
with first-class support for vLLM.
Ray Data provides functionality for:
* Reading and writing to cloud storage (S3, GCS, etc.)
* Automatic sharding and load-balancing across a cluster
* Optimized configuration of vLLM using continuous batching
* Compatible with tensor/pipeline parallel inference as well.
* Reading and writing to most popular file formats and cloud object storage.
* Streaming execution, so you can run inference on datasets that far exceed
the aggregate RAM of the cluster.
* Scale up the workload without code changes.
* Automatic sharding, load-balancing, and autoscaling across a Ray cluster,
with built-in fault-tolerance and retry semantics.
* Continuous batching that keeps vLLM replicas saturated and maximizes GPU
utilization.
* Compatible with tensor/pipeline parallel inference.
Learn more about Ray Data's LLM integration:
https://docs.ray.io/en/latest/data/working-with-llms.html
...
...
examples/offline_inference/convert_model_to_seq_cls.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import
argparse
import
json
import
torch
import
transformers
# Usage:
# for BAAI/bge-reranker-v2-gemma
# Caution: "Yes" and "yes" are two different tokens
# python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
# for mxbai-rerank-v2
# python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
# for Qwen3-Reranker
# python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
    """Build a single-logit classifier head from two rows of the LM head.

    The classifier weight is the LM-head row of the "true" token minus the
    LM-head row of the "false" token, computed in float32 and written in
    place into ``seq_cls_model.score.weight`` as a single row. A softmax
    over two logits is equivalent to a sigmoid of their difference, which is
    presumably why one output row is enough (see the linked discussion).

    Args:
        causal_lm: Model exposing ``lm_head.weight``.
        seq_cls_model: Model exposing a ``score`` layer with ``weight`` and
            an optional ``bias``; both are modified in place.
        tokenizer: Object providing ``convert_tokens_to_ids``.
        tokens: Exactly two tokens, ordered ``[false_token, true_token]``.
        device: Device the selected LM-head rows are moved to before the
            subtraction.

    Raises:
        ValueError: If ``tokens`` does not contain exactly two entries.
    """
    # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
    # Raise instead of assert so the check survives `python -O`.
    if len(tokens) != 2:
        raise ValueError(
            "from_2_way_softmax expects exactly two tokens "
            f"[false_token, true_token], got {len(tokens)}: {tokens!r}"
        )

    lm_head_weights = causal_lm.lm_head.weight

    false_id = tokenizer.convert_tokens_to_ids(tokens[0])
    true_id = tokenizer.convert_tokens_to_ids(tokens[1])

    # Upcast both rows to float32 before subtracting.
    true_row = lm_head_weights[true_id].to(device).to(torch.float32)
    false_row = lm_head_weights[false_id].to(device).to(torch.float32)
    score_weight = true_row - false_row

    with torch.no_grad():
        seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0))
        if seq_cls_model.score.bias is not None:
            seq_cls_model.score.bias.zero_()
def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
    """Copy the LM-head rows of ``tokens`` into the classifier head as-is.

    One LM-head row is selected per token, in the order given, moved to
    ``device`` and written in place into ``seq_cls_model.score.weight``.
    The classifier bias, when present, is zeroed.

    Args:
        causal_lm: Model exposing ``lm_head.weight``.
        seq_cls_model: Model exposing a ``score`` layer with ``weight`` and
            an optional ``bias``; both are modified in place.
        tokenizer: Object providing ``convert_tokens_to_ids``.
        tokens: Tokens whose LM-head rows become the classifier rows.
        device: Device the selected rows are moved to before the copy.
    """
    head_matrix = causal_lm.lm_head.weight
    row_indices = list(map(tokenizer.convert_tokens_to_ids, tokens))
    selected_rows = head_matrix[row_indices].to(device)

    with torch.no_grad():
        classifier = seq_cls_model.score
        classifier.weight.copy_(selected_rows)
        if classifier.bias is None:
            return
        classifier.bias.zero_()
# Conversion strategies keyed by function name; `converting` looks up the
# requested method in this table.
method_map = {
    "from_2_way_softmax": from_2_way_softmax,
    "no_post_processing": no_post_processing,
}
def converting(
    model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu"
):
    """Convert a *ForCausalLM checkpoint into a *ForSequenceClassification one.

    Loads the tokenizer, the causal LM and a sequence-classification model
    for ``model_name``, fills the classification head using the strategy
    selected by ``method``, then saves the model and tokenizer to ``path``.

    Args:
        model_name: Name or path passed to the ``from_pretrained`` loaders.
        classifier_from_tokens: Non-empty list of tokens whose LM-head rows
            define the classifier. Must have exactly two entries
            (``[false_token, true_token]``) for ``from_2_way_softmax``.
        path: Directory the converted model and tokenizer are saved to.
        method: Key of ``method_map`` naming the conversion strategy.
        use_pad_token: Value stored in the saved config's ``use_pad_token``.
        device: Passed as ``device_map`` to both model loaders and to the
            conversion strategy.

    Raises:
        ValueError: If ``method`` is unknown or ``classifier_from_tokens``
            is not a non-empty list of the length the method requires.
    """
    # Validate with exceptions rather than asserts: these values come from
    # the command line and asserts are stripped under `python -O`.
    if method not in method_map:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(method_map)}"
        )
    # A bare JSON string (e.g. '"Yes"') would otherwise be treated as a
    # sequence of single characters.
    if isinstance(classifier_from_tokens, str) or not classifier_from_tokens:
        raise ValueError(
            "classifier_from_tokens must be a non-empty list of tokens, "
            f"got {classifier_from_tokens!r}"
        )

    if method == "from_2_way_softmax":
        if len(classifier_from_tokens) != 2:
            raise ValueError(
                "from_2_way_softmax requires exactly two tokens, "
                f"got {len(classifier_from_tokens)}: {classifier_from_tokens!r}"
            )
        # The two token rows are collapsed into a single output logit.
        num_labels = 1
    else:
        num_labels = len(classifier_from_tokens)

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    causal_lm = transformers.AutoModelForCausalLM.from_pretrained(
        model_name, device_map=device
    )

    seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
        device_map=device,
    )

    method_map[method](
        causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device
    )

    # `llm as reranker` defaults to not using pad_token
    seq_cls_model.config.use_pad_token = use_pad_token
    seq_cls_model.config.pad_token_id = tokenizer.pad_token_id

    seq_cls_model.save_pretrained(path)
    tokenizer.save_pretrained(path)
def parse_args(argv=None):
    """Parse the command-line options of the conversion script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default), argparse reads the arguments from ``sys.argv``.

    Returns:
        argparse.Namespace with ``model_name``, ``classifier_from_tokens``
        (still a JSON string), ``method``, ``use_pad_token`` and ``path``.
    """
    parser = argparse.ArgumentParser(
        description="Converting *ForCausalLM models to "
        "*ForSequenceClassification models."
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="BAAI/bge-reranker-v2-gemma",
        help="Model name",
    )
    parser.add_argument(
        "--classifier_from_tokens",
        type=str,
        default='["Yes"]',
        help="classifier from tokens",
    )
    parser.add_argument(
        "--method",
        type=str,
        default="no_post_processing",
        help="Conversion method: from_2_way_softmax or no_post_processing",
    )
    parser.add_argument(
        "--use-pad-token", action="store_true", help="Whether to use pad_token"
    )
    parser.add_argument(
        "--path",
        type=str,
        default="./bge-reranker-v2-gemma-seq-cls",
        help="Path to save converted model",
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: read the CLI options and run the conversion.
    # --classifier_from_tokens arrives as a JSON string and is decoded here;
    # `device` is not passed, so `converting` uses its default.
    args = parse_args()
    converting(
        path=args.path,
        method=args.method,
        model_name=args.model_name,
        use_pad_token=args.use_pad_token,
        classifier_from_tokens=json.loads(args.classifier_from_tokens),
    )
examples/offline_inference/embed_jina_embeddings_v3.py
View file @
711aa9d5
...
...
@@ -30,11 +30,11 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="embed" for embedding models
model
=
LLM
(
**
vars
(
args
))
llm
=
LLM
(
**
vars
(
args
))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
# Only text matching task is supported for now. See #16120
outputs
=
model
.
embed
(
prompts
)
outputs
=
llm
.
embed
(
prompts
)
# Print the outputs.
print
(
"
\n
Generated Outputs:"
)
...
...
examples/offline_inference/embed_matryoshka_fy.py
View file @
711aa9d5
...
...
@@ -30,10 +30,10 @@ def main(args: Namespace):
# Create an LLM.
# You should pass task="embed" for embedding models
model
=
LLM
(
**
vars
(
args
))
llm
=
LLM
(
**
vars
(
args
))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs
=
model
.
embed
(
prompts
,
pooling_params
=
PoolingParams
(
dimensions
=
32
))
outputs
=
llm
.
embed
(
prompts
,
pooling_params
=
PoolingParams
(
dimensions
=
32
))
# Print the outputs.
print
(
"
\n
Generated Outputs:"
)
...
...
examples/offline_inference/neuron_eagle.py
View file @
711aa9d5
...
...
@@ -54,7 +54,7 @@ def main():
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
,
\n\n\n
\
Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
,
\n\n\n
Generated text:
{
generated_text
!
r
}
"
)
if
__name__
==
"__main__"
:
...
...
examples/offline_inference/neuron_speculation.py
View file @
711aa9d5
...
...
@@ -25,7 +25,7 @@ def config_buckets():
os
.
environ
[
"NEURON_TOKEN_GEN_BUCKETS"
]
=
"128,512,1024,2048"
def
initialize_
model
():
def
initialize_
llm
():
"""Create an LLM with speculative decoding."""
return
LLM
(
model
=
"openlm-research/open_llama_7b"
,
...
...
@@ -37,15 +37,14 @@ def initialize_model():
max_num_seqs
=
4
,
max_model_len
=
2048
,
block_size
=
2048
,
use_v2_block_manager
=
True
,
device
=
"neuron"
,
tensor_parallel_size
=
32
,
)
def
process_requests
(
model
:
LLM
,
sampling_params
:
SamplingParams
):
def
process_requests
(
llm
:
LLM
,
sampling_params
:
SamplingParams
):
"""Generate texts from prompts and print them."""
outputs
=
model
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
...
...
@@ -53,12 +52,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams):
def
main
():
"""Main function that sets up the
model
and processes prompts."""
"""Main function that sets up the
llm
and processes prompts."""
config_buckets
()
model
=
initialize_
model
()
llm
=
initialize_
llm
()
# Create a sampling params object.
sampling_params
=
SamplingParams
(
max_tokens
=
100
,
top_k
=
1
)
process_requests
(
model
,
sampling_params
)
process_requests
(
llm
,
sampling_params
)
if
__name__
==
"__main__"
:
...
...
examples/offline_inference/prithvi_geospatial_mae.py
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This is a demo script showing how to use the
PrithviGeospatialMAE model with vLLM
This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa
Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa
The requirements for running this script are:
- Installing [terratorch, albumentations, rasterio] in your python environment
- downloading the model weights in a 'model' folder local to the script
(temporary measure until the proper config.json file is uploaded to HF)
- download an input example image (India_900498_S2Hand.tif) and place it in
the same folder with the script (or specify with the --data_file argument)
Run the example:
python prithvi_geospatial_mae.py
"""
# noqa: E501
import
argparse
import
datetime
import
os
import
re
from
typing
import
Union
import
albumentations
import
numpy
as
np
import
rasterio
import
regex
as
re
import
torch
from
einops
import
rearrange
from
terratorch.datamodules
import
Sen1Floods11NonGeoDataModule
from
vllm
import
LLM
torch
.
set_default_dtype
(
torch
.
float16
)
NO_DATA
=
-
9999
NO_DATA_FLOAT
=
0.0001
OFFSET
=
0
PERCENTILE
=
99
model_config
=
"""{
"architectures": ["PrithviGeoSpatialMAE"],
"num_classes": 0,
"pretrained_cfg": {
"task_args": {
"task": "SemanticSegmentationTask",
"model_factory": "EncoderDecoderFactory",
"loss": "ce",
"ignore_index": -1,
"lr": 0.001,
"freeze_backbone": false,
"freeze_decoder": false,
"plot_on_val": 10,
"optimizer": "AdamW",
"scheduler": "CosineAnnealingLR"
},
"model_args": {
"backbone_pretrained": false,
"backbone": "prithvi_eo_v2_300_tl",
"decoder": "UperNetDecoder",
"decoder_channels": 256,
"decoder_scale_modules": true,
"num_classes": 2,
"rescale": true,
"backbone_bands": [
"BLUE",
"GREEN",
"RED",
"NIR_NARROW",
"SWIR_1",
"SWIR_2"
],
"head_dropout": 0.1,
"necks": [
{
"name": "SelectIndices",
"indices": [
5,
11,
17,
23
]
},
{
"name": "ReshapeTokensToImage"
}
]
},
"optimizer_params" : {
"lr": 5.0e-05,
"betas": [0.9, 0.999],
"eps": [1.0e-08],
"weight_decay": 0.05,
"amsgrad": false,
"maximize": false,
"capturable": false,
"differentiable": false
},
"scheduler_params" : {
"T_max": 50,
"eta_min": 0,
"last_epoch": -1,
"verbose": "deprecated"
}
},
"torch_dtype": "float32"
}
"""
# Temporarily creating the "config.json" for the model.
# This is going to disappear once the correct config.json is available on HF
with
open
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"./model/config.json"
),
"w"
)
as
config_file
:
config_file
.
write
(
model_config
)
datamodule_config
=
{
"bands"
:
[
"BLUE"
,
"GREEN"
,
"RED"
,
"NIR_NARROW"
,
"SWIR_1"
,
"SWIR_2"
],
"batch_size"
:
16
,
...
...
@@ -138,28 +43,24 @@ datamodule_config = {
class
PrithviMAE
:
def
__init__
(
self
):
print
(
"Initializing PrithviMAE model"
)
def
__init__
(
self
,
model
):
self
.
model
=
LLM
(
model
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"./model"
),
skip_tokenizer_init
=
True
,
dtype
=
"float32"
,
model
=
model
,
skip_tokenizer_init
=
True
,
dtype
=
"float16"
,
enforce_eager
=
True
)
def
run
(
self
,
input_data
,
location_coords
):
print
(
"################ Running inference on vLLM ##############"
)
# merge the inputs into one data structure
if
input_data
is
not
None
and
input_data
.
dtype
==
torch
.
float32
:
input_data
=
input_data
.
to
(
torch
.
float16
)
input_data
=
input_data
[
0
]
mm_data
=
{
"pixel_values"
:
torch
.
empty
(
0
)
if
input_data
is
None
else
input_data
,
"location_coords"
:
torch
.
empty
(
0
)
if
location_coords
is
None
else
location_coords
,
"pixel_values"
:
input_data
,
"location_coords"
:
location_coords
,
}
prompt
=
{
"prompt_token_ids"
:
[
1
],
"multi_modal_data"
:
mm_data
}
outputs
=
self
.
model
.
encode
(
prompt
,
use_tqdm
=
False
)
print
(
"################ Inference done (it took seconds) ##############"
)
return
outputs
[
0
].
outputs
.
data
...
...
@@ -181,11 +82,12 @@ def process_channel_group(orig_img, channels):
"""
Args:
orig_img: torch.Tensor representing original image (reference)
with shape = (bands, H, W).
with shape = (bands, H, W).
channels: list of indices representing RGB channels.
Returns:
torch.Tensor with shape (num_channels, height, width) for original image
torch.Tensor with shape (num_channels, height, width)
for original image
"""
orig_img
=
orig_img
[
channels
,
...]
...
...
@@ -260,10 +162,10 @@ def load_example(
Args:
file_paths: list of file paths.
mean: list containing mean values for each band in the
images
in *file_paths*.
std: list containing std values for each band in the
images
in *file_paths*.
mean: list containing mean values for each band in the
images
in *file_paths*.
std: list containing std values for each band in the
images
in *file_paths*.
Returns:
np.array containing created example
...
...
@@ -308,7 +210,7 @@ def load_example(
print
(
f
"Could not extract timestamp for
{
file
}
(
{
e
}
)"
)
imgs
=
np
.
stack
(
imgs
,
axis
=
0
)
# num_frames, H, W, C
imgs
=
np
.
moveaxis
(
imgs
,
-
1
,
0
).
astype
(
"float32"
)
imgs
=
np
.
moveaxis
(
imgs
,
-
1
,
0
).
astype
(
"float32"
)
# C, num_frames, H, W
imgs
=
np
.
expand_dims
(
imgs
,
axis
=
0
)
# add batch dim
return
imgs
,
temporal_coords
,
location_coords
,
metas
...
...
@@ -332,8 +234,10 @@ def run_model(
)
# Build sliding window
batch_size
=
1
batch
=
torch
.
tensor
(
input_data
,
device
=
"cpu"
)
# batch = torch.tensor(input_data, device="cpu")
batch
=
torch
.
tensor
(
input_data
)
windows
=
batch
.
unfold
(
3
,
img_size
,
img_size
).
unfold
(
4
,
img_size
,
img_size
)
h1
,
w1
=
windows
.
shape
[
3
:
5
]
windows
=
rearrange
(
...
...
@@ -344,18 +248,16 @@ def run_model(
num_batches
=
windows
.
shape
[
0
]
//
batch_size
if
windows
.
shape
[
0
]
>
batch_size
else
1
windows
=
torch
.
tensor_split
(
windows
,
num_batches
,
dim
=
0
)
device
=
torch
.
device
(
"cuda"
)
if
torch
.
cuda
.
is_available
()
else
torch
.
device
(
"cpu"
)
if
temporal_coords
:
temporal_coords
=
torch
.
tensor
(
temporal_coords
,
device
=
device
).
unsqueeze
(
0
)
temporal_coords
=
torch
.
tensor
(
temporal_coords
).
unsqueeze
(
0
)
else
:
temporal_coords
=
None
if
location_coords
:
location_coords
=
torch
.
tensor
(
location_coords
[
0
]
,
device
=
device
).
unsqueeze
(
0
)
location_coords
=
torch
.
tensor
(
location_coords
[
0
]).
unsqueeze
(
0
)
else
:
location_coords
=
None
# Run
model
# Run
Prithvi-EO-V2-300M-TL-Sen1Floods11
pred_imgs
=
[]
for
x
in
windows
:
# Apply standardization
...
...
@@ -363,15 +265,7 @@ def run_model(
x
=
datamodule
.
aug
(
x
)[
"image"
]
with
torch
.
no_grad
():
x
=
x
.
to
(
device
)
pred
=
model
.
run
(
x
,
location_coords
=
location_coords
)
if
lightning_model
:
pred_lightning
=
lightning_model
(
x
,
temporal_coords
=
temporal_coords
,
location_coords
=
location_coords
)
pred_lightning
=
pred_lightning
.
output
.
detach
().
cpu
()
if
not
torch
.
equal
(
pred
,
pred_lightning
):
print
(
"Inference output is not equal"
)
y_hat
=
pred
.
argmax
(
dim
=
1
)
y_hat
=
torch
.
nn
.
functional
.
interpolate
(
...
...
@@ -403,52 +297,18 @@ def run_model(
return
pred_imgs
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
"MAE run inference"
,
add_help
=
False
)
parser
.
add_argument
(
"--data_file"
,
type
=
str
,
default
=
"./India_900498_S2Hand.tif"
,
help
=
"Path to the file."
,
)
parser
.
add_argument
(
"--output_dir"
,
type
=
str
,
default
=
"output"
,
help
=
"Path to the directory where to save outputs."
,
)
parser
.
add_argument
(
"--input_indices"
,
default
=
[
1
,
2
,
3
,
8
,
11
,
12
],
type
=
int
,
nargs
=
"+"
,
help
=
"0-based indices of the six Prithvi channels to be selected from the "
"input. By default selects [1,2,3,8,11,12] for S2L1C data."
,
)
parser
.
add_argument
(
"--rgb_outputs"
,
action
=
"store_true"
,
help
=
"If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved."
,
)
def
main
(
data_file
:
str
,
model
:
str
,
output_dir
:
str
,
rgb_outputs
:
bool
,
input_indices
:
list
[
int
]
=
None
,
):
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
# Load model ---------------------------------------------------------------
model_obj
=
PrithviMAE
()
model_obj
=
PrithviMAE
(
model
=
model
)
datamodule
=
generate_datamodule
()
img_size
=
256
# Size of Sen1Floods11
# Loading data -------------------------------------------------------------
img_size
=
512
# Size of Sen1Floods11
input_data
,
temporal_coords
,
location_coords
,
meta_data
=
load_example
(
file_paths
=
[
data_file
],
...
...
@@ -460,8 +320,6 @@ def main(
if
input_data
.
mean
()
>
1
:
input_data
=
input_data
/
10000
# Convert to range 0-1
# Running model ------------------------------------------------------------
channels
=
[
datamodule_config
[
"bands"
].
index
(
b
)
for
b
in
[
"RED"
,
"GREEN"
,
"BLUE"
]
]
# BGR -> RGB
...
...
@@ -469,7 +327,6 @@ def main(
pred
=
run_model
(
input_data
,
temporal_coords
,
location_coords
,
model_obj
,
datamodule
,
img_size
)
# Save pred
meta_data
.
update
(
count
=
1
,
dtype
=
"uint8"
,
compress
=
"lzw"
,
nodata
=
0
)
pred_file
=
os
.
path
.
join
(
...
...
@@ -487,6 +344,7 @@ def main(
orig_img
=
torch
.
Tensor
(
input_data
[
0
,
:,
0
,
...]),
channels
=
channels
,
)
rgb_orig
=
rgb_orig
.
to
(
torch
.
float32
)
pred
[
pred
==
0.0
]
=
np
.
nan
img_pred
=
rgb_orig
*
0.7
+
pred
*
0.3
...
...
@@ -503,9 +361,10 @@ def main(
# Save image rgb
if
rgb_outputs
:
name_suffix
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
data_file
))[
0
]
rgb_file
=
os
.
path
.
join
(
output_dir
,
f
"original_rgb_
{
os
.
path
.
splitext
(
os
.
path
.
basename
(
data_file
))[
0
]
}
.tiff"
,
f
"original_rgb_
{
name_suffix
}
.tiff"
,
)
save_geotiff
(
image
=
_convert_np_uint8
(
rgb_orig
),
...
...
@@ -515,6 +374,42 @@ def main(
if
__name__
==
"__main__"
:
args
=
parse_args
()
parser
=
argparse
.
ArgumentParser
(
"MAE run inference"
,
add_help
=
False
)
parser
.
add_argument
(
"--data_file"
,
type
=
str
,
default
=
"./India_900498_S2Hand.tif"
,
help
=
"Path to the file."
,
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
,
help
=
"Path to a checkpoint file to load from."
,
)
parser
.
add_argument
(
"--output_dir"
,
type
=
str
,
default
=
"output"
,
help
=
"Path to the directory where to save outputs."
,
)
parser
.
add_argument
(
"--input_indices"
,
default
=
[
1
,
2
,
3
,
8
,
11
,
12
],
type
=
int
,
nargs
=
"+"
,
help
=
"""
0-based indices of the six Prithvi channels to be selected from the input.
By default selects [1,2,3,8,11,12] for S2L1C data.
"""
,
)
parser
.
add_argument
(
"--rgb_outputs"
,
action
=
"store_true"
,
help
=
"If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved."
,
)
args
=
parser
.
parse_args
()
main
(
**
vars
(
args
))
examples/offline_inference/qwen3_reranker.py
View file @
711aa9d5
...
...
@@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
# Models converted offline using this method can not only be more efficient
# and support the vllm score API, but also make the init parameters more
# concise, for example.
#
model
= LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
#
llm
= LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
# If you want to load the official original version, the init parameters are
# as follows.
def
get_
model
()
->
LLM
:
def
get_
llm
()
->
LLM
:
"""Initializes and returns the LLM model for Qwen3-Reranker."""
return
LLM
(
model
=
model_name
,
...
...
@@ -76,8 +76,8 @@ def main() -> None:
]
documents
=
[
document_template
.
format
(
doc
=
doc
,
suffix
=
suffix
)
for
doc
in
documents
]
model
=
get_
model
()
outputs
=
model
.
score
(
queries
,
documents
)
llm
=
get_
llm
()
outputs
=
llm
.
score
(
queries
,
documents
)
print
(
"-"
*
30
)
print
([
output
.
outputs
.
score
for
output
in
outputs
])
...
...
examples/offline_inference/rlhf.py
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
a simple demonstration of RLHF with vLLM, inspired by
the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
It follows the design that, training processes and inference processes
are different, and they live on different GPUs.
Training processes send prompts to inference processes to generate data,
and also synchronize the weights of the model by broadcasting the weights
from the training process to the inference process.
Note that this is a simple demonstration of one training instance and one
inference instance. In practice, there could be multiple training instances
and multiple inference instances. For the full implementation, please refer
to the OpenRLHF framework.
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformer model occupies GPU 0 for training, whereas a
tensor-parallel vLLM inference engine occupies GPU 1–2.
The example performs the following steps:
* Load the training model on GPU 0.
* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism
and Ray placement groups.
* Generate text from a list of prompts using the inference engine.
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group. Note that
for demonstration purposes we simply zero out the weights.
For a production-ready implementation that supports multiple training and
inference replicas, see the OpenRLHF framework:
https://github.com/OpenRLHF/OpenRLHF
This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects the GPUs to be used only for vLLM
workloads. Residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.
"""
import
os
...
...
@@ -28,29 +42,27 @@ from vllm.utils import get_ip, get_open_port
class
MyLLM
(
LLM
):
"""Configure the vLLM worker for Ray placement group execution."""
def
__init__
(
self
,
*
args
,
**
kwargs
):
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
# Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
# so that vLLM can manage its own device placement within the worker.
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
super
().
__init__
(
*
args
,
**
kwargs
)
"""
Start the training process, here we use huggingface transformers
as an example to hold a model on GPU 0.
"""
# Load the OPT-125M model onto GPU 0 for the training workload.
train_model
=
AutoModelForCausalLM
.
from_pretrained
(
"facebook/opt-125m"
)
train_model
.
to
(
"cuda:0"
)
"""
Start the inference process, here we use vLLM to hold a model on GPU 1 and
GPU 2. For the details on how to use ray, please refer to the ray
documentation https://docs.ray.io/en/latest/ .
"""
# Initialize Ray and set the visible devices. The vLLM engine will
# be placed on GPUs 1 and 2.
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
"1,2"
ray
.
init
()
# Create a placement group that reserves GPU 1–2 for the vLLM inference engine.
# Learn more about Ray placement groups:
# https://docs.ray.io/en/latest/placement-groups.html
pg_inference
=
placement_group
([{
"GPU"
:
1
,
"CPU"
:
0
}]
*
2
)
ray
.
get
(
pg_inference
.
ready
())
scheduling_inference
=
PlacementGroupSchedulingStrategy
(
...
...
@@ -58,10 +70,9 @@ scheduling_inference = PlacementGroupSchedulingStrategy(
placement_group_capture_child_tasks
=
True
,
placement_group_bundle_index
=
0
,
)
"""
launch the vLLM inference engine.
here we use `enforce_eager` to reduce the start time.
"""
# Launch the vLLM inference engine. The `enforce_eager` flag reduces
# start-up latency.
llm
=
ray
.
remote
(
num_cpus
=
0
,
num_gpus
=
0
,
...
...
@@ -74,7 +85,7 @@ llm = ray.remote(
distributed_executor_backend
=
"ray"
,
)
# Generate text
s
from the prompts.
# Generate text from the prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
@@ -93,8 +104,8 @@ for output in outputs:
print
(
f
"Prompt:
{
prompt
!
r
}
\n
Generated text:
{
generated_text
!
r
}
"
)
print
(
"-"
*
50
)
#
s
et up the communication between the training process
#
and the
inference engine.
#
S
et up the communication
channel
between the training process
and the
# inference engine.
master_address
=
get_ip
()
master_port
=
get_open_port
()
...
...
@@ -107,21 +118,23 @@ model_update_group = stateless_init_process_group(
)
ray
.
get
(
handle
)
# simulate training, modify the weights of the model.
# Simulate a training step by zeroing out all model weights.
# In a real RLHF training loop the weights would be updated using the gradient
# from an RL objective such as PPO on a reward model.
for
name
,
p
in
train_model
.
named_parameters
():
p
.
data
.
zero_
()
#
s
ync
weight from the training proces
s to the inference engine.
#
S
ync
hronize the updated weight
s to the inference engine.
for
name
,
p
in
train_model
.
named_parameters
():
handle
=
llm
.
collective_rpc
.
remote
(
"update_weight"
,
args
=
(
name
,
p
.
dtype
,
p
.
shape
))
model_update_group
.
broadcast
(
p
,
src
=
0
,
stream
=
torch
.
cuda
.
current_stream
())
ray
.
get
(
handle
)
#
check if the weights are
updated.
#
Verify that the inference weights have been
updated.
assert
all
(
ray
.
get
(
llm
.
collective_rpc
.
remote
(
"check_weights_changed"
)))
#
use the updated model to generate texts, they will
be nonsense
# because the weights are
all
zero
s
.
#
Generate text with the updated model. The output is expected to
be nonsense
# because the weights are zero.
outputs_updated
=
ray
.
get
(
llm
.
generate
.
remote
(
prompts
,
sampling_params
))
print
(
"-"
*
50
)
for
output
in
outputs_updated
:
...
...
examples/offline_inference/rlhf_colocate.py
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
a simple demonstration to show how to co-locate
vLLM worker with training actors on the same GPUs,
for RLHF-like applications.
The key points:
- Control the placement of the vLLM workers with Ray, by setting
VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly.
- Use cuda-ipc to pass tensors, since NCCL does not work when we have
multiple processes on the same GPU.
Demonstrates how to co-locate a vLLM inference worker and training
actors on the same set of GPUs for reinforcement learning from human feedback
(RLHF) workloads.
Ray serves as the distributed execution framework in this example. Ray
placement groups allocate both training actors and vLLM workers to the
same GPU bundles, enabling fast, in-GPU communication between the two
components.
The script shows how to do the following:
* Configure environment variables (`VLLM_RAY_PER_WORKER_GPUS` and
`VLLM_RAY_BUNDLE_INDICES`) so that vLLM workers land on the desired
devices.
* Exchange tensors between processes by means of CUDA inter-process
communication (IPC). CUDA IPC sidesteps NCCL limitations that occur
when multiple processes share a single GPU.
Note that this example assumes a single-node cluster with four GPUs, but Ray
supports multi-node clusters. vLLM expects exclusive use of the GPUs during
its initialization for memory profiling. Residual GPU activity interferes
with vLLM memory profiling and causes unexpected behavior.
Learn more about Ray placement groups:
https://docs.ray.io/en/latest/placement-groups.html
"""
import
os
...
...
@@ -22,13 +39,24 @@ from vllm import LLM
class
MyLLM
(
LLM
):
def
__init__
(
self
,
*
args
,
bundle_indices
:
list
,
**
kwargs
):
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
"""Configure the vLLM worker for Ray placement group execution.
The constructor sets environment variables that allow multiple vLLM
workers to share a single physical GPU and that encode the bundle
indices assigned by the placement group.
Args:
*args: Positional arguments forwarded to `vllm.LLM`.
bundle_indices (list[int]): Placement-group bundle indices
assigned to this worker.
**kwargs: Keyword arguments forwarded to `vllm.LLM`.
"""
def
__init__
(
self
,
*
args
,
bundle_indices
:
list
[
int
],
**
kwargs
):
# Prevent Ray from manipulating the top-level CUDA_VISIBLE_DEVICES variable
# so that vLLM can manage its own device placement inside the worker.
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
# every worker will use 0.4 GPU, so that we can schedule
# 2 instances on the same GPUs.
# Each worker uses 0.4 GPU so that two instances fit on the same GPUs.
os
.
environ
[
"VLLM_RAY_PER_WORKER_GPUS"
]
=
"0.4"
os
.
environ
[
"VLLM_RAY_BUNDLE_INDICES"
]
=
","
.
join
(
map
(
str
,
bundle_indices
))
print
(
f
"creating LLM with bundle_indices=
{
bundle_indices
}
"
)
...
...
@@ -36,17 +64,25 @@ class MyLLM(LLM):
class
RayTrainingActor
:
"""Training actor that hosts a Facebook OPT-125M model from Hugging Face.
The model is loaded onto the first GPU assigned to this actor, which exposes
the CUDA IPC handles so that colocated vLLM workers can map tensors
directly.
"""
def
__init__
(
self
):
#
r
ay
will
set CUDA_VISIBLE_DEVICES to the assigned
GPUs
#
R
ay set
s
CUDA_VISIBLE_DEVICES to the
GPUs
assigned
to this actor.
from
transformers
import
AutoModelForCausalLM
self
.
model
=
AutoModelForCausalLM
.
from_pretrained
(
"facebook/opt-125m"
)
self
.
model
.
to
(
"cuda:0"
)
# Zero out all the parameters.
for
name
,
p
in
self
.
model
.
named_parameters
():
p
.
data
.
zero_
()
torch
.
cuda
.
synchronize
()
#
t
he argument for get_device_uuid is the index
#
of the GPU in the
visible devices.
#
T
he argument for
`
get_device_uuid
`
is the index
of the GPU in the
#
list of
visible devices.
from
vllm.platforms
import
current_platform
self
.
device_uuid
=
current_platform
.
get_device_uuid
(
0
)
...
...
@@ -59,23 +95,23 @@ class RayTrainingActor:
data
=
{}
for
name
,
p
in
self
.
model
.
named_parameters
():
# the training actor might only have a subset of the weights
# and need to all-gather the weights from all the actors.
# for demonstration, here we assume all training actors have
# the full weights.
# A training actor might hold only a subset of the weights and may
# need to gather weights from other actors. For demonstration
# purposes, each training actor owns the full weight set.
data
[
name
]
=
reduce_tensor
(
p
.
detach
())
return
{
self
.
device_uuid
:
data
}
# ray manages 4 GPUs
# Ray manages four GPUs.
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
"0,1,2,3"
ray
.
init
()
#
we want to c
o-locate vLLM instance and
the
training actor
#
on the same set of GPUs.
#
the placement plan is as follows:
# GPU
0
and
1
: training actor
0
,
1
, and vLLM instance
0 (with TP=2)
#
GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=
2)
#
C
o-locate vLLM instance
s
and training actor
s on the same set of GPUs:
#
* GPU 0 and 1: training actor 0, training actor 1, and vLLM instance 0
#
(tensor parallelism = 2).
#
*
GPU
2
and
3
: training actor
2
,
training actor 3
, and vLLM instance
1
#
(tensor parallelism =
2)
.
pg
=
placement_group
([{
"GPU"
:
1
,
"CPU"
:
0
}]
*
4
)
ray
.
get
(
pg
.
ready
())
...
...
@@ -104,10 +140,8 @@ for bundle_index, training_actor in enumerate(training_actors):
training_actor_device_ids
.
append
(
device_id
)
for
i
,
bundle_indices
in
enumerate
([[
0
,
1
],
[
2
,
3
]]):
# IMPORTANT: when creating vLLM instances, we need to
# make sure there are no GPU activities on the target GPUs,
# otherwise, they will interfere with the vLLM memory profiling,
# and cause unexpected behaviors.
# Use the following syntax instead of the @ray.remote decorator so that
# the placement group is customized for each bundle.
llm
=
ray
.
remote
(
num_cpus
=
0
,
num_gpus
=
0
,
...
...
@@ -125,8 +159,8 @@ for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
bundle_indices
=
bundle_indices
,
)
inference_engines
.
append
(
llm
)
#
don'
t call any method on the inference engine
here,
#
otherwise it will
block until the vLLM instance
is created
.
#
Do no
t call any method on the inference engine
at this point; the call
# block
s
until the vLLM instance
finishes initialization
.
for
i
,
llm
in
enumerate
(
inference_engines
):
inference_engine_device_ids
.
append
(
...
...
@@ -134,26 +168,25 @@ for i, llm in enumerate(inference_engines):
)
print
(
f
"inference engine
{
i
}
is on
{
inference_engine_device_ids
[
-
1
]
}
"
)
# check the placement
# the first two training actors should be
# on the same GPUs as the first inference engine
# Verify placement: the first two training actors share the same GPUs as
# the first inference engine.
assert
training_actor_device_ids
[:
2
]
==
inference_engine_device_ids
[
0
]
# the last two training actors sh
ould be
#
on the same GPUs as
the second inference engine
#
Verify placement:
the last two training actors sh
are the same GPUs as
# the second inference engine
.
assert
training_actor_device_ids
[
2
:]
==
inference_engine_device_ids
[
1
]
print
(
"
g
ather all the IPC handles from the training actors"
)
print
(
"
G
ather all the IPC handles from the training actors
.
"
)
ipc_handles
=
{}
for
actor
in
training_actors
:
ipc_handles
.
update
(
ray
.
get
(
actor
.
get_weight_ipc_handles
.
remote
()))
print
(
"
u
pdate the weights of the inference engines"
)
print
(
"
U
pdate the weights of the inference engines
.
"
)
for
llm
in
inference_engines
:
ray
.
get
(
llm
.
collective_rpc
.
remote
(
"update_weights_from_ipc_handles"
,
args
=
(
ipc_handles
,)
)
)
print
(
"
c
heck if the weights are updated"
)
print
(
"
C
heck if the weights are updated
.
"
)
for
llm
in
inference_engines
:
assert
ray
.
get
(
llm
.
collective_rpc
.
remote
(
"check_weights_changed"
,
args
=
tuple
()))
examples/offline_inference/skip_loading_weights_in_engine_init.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
# Sample prompts; the whole list is passed to `llm.generate` as one batch.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object; it is reused for every generation call
# in this script.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
def print_prompts_and_outputs(outputs: list[RequestOutput]) -> None:
    """Print every prompt with its first completion, separated by rules.

    Args:
        outputs: Request outputs to display. Only the first completion
            (``outputs[0]``) of each request is printed.
    """
    rule = "-" * 60
    print(rule)
    for request_output in outputs:
        prompt_text = request_output.prompt
        completion_text = request_output.outputs[0].text
        print(f"Prompt: {prompt_text!r}")
        print(f"Output: {completion_text!r}")
        print(rule)
def main():
    """Generate with dummy weights, then reload real weights and generate again.

    The engine is first built with ``load_format="dummy"`` so no checkpoint
    is read during initialization. The load format is then switched to
    ``auto`` on the workers and the weights are reloaded in place.
    """
    # Build the engine without reading real weights.
    engine_kwargs = {
        "model": "Qwen/Qwen3-0.6B",
        "load_format": "dummy",
        "enforce_eager": True,
        "tensor_parallel_size": 4,
    }
    llm = LLM(**engine_kwargs)

    dummy_outputs = llm.generate(prompts, sampling_params)
    print("\nOutputs do not make sense:")
    print_prompts_and_outputs(dummy_outputs)

    # Switch the load format from `dummy` to `auto` on the workers ...
    config_override = {"load_config": {"load_format": "auto"}}
    llm.collective_rpc("update_config", args=(config_override,))

    # ... and reload the real weights in place.
    llm.collective_rpc("reload_weights")

    # With real weights loaded, the generations should be sensible.
    real_outputs = llm.generate(prompts, sampling_params)
    print("\nOutputs make sense after loading real weights:")
    print_prompts_and_outputs(real_outputs)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/spec_decode.py
View file @
711aa9d5
...
...
@@ -84,6 +84,7 @@ def main():
gpu_memory_utilization
=
0.8
,
speculative_config
=
speculative_config
,
disable_log_stats
=
False
,
max_model_len
=
16384
,
)
sampling_params
=
SamplingParams
(
temperature
=
args
.
temp
,
max_tokens
=
args
.
output_len
)
...
...
examples/offline_inference/vision_language.py
View file @
711aa9d5
...
...
@@ -429,6 +429,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
# Nemotron_VL
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the request data for nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1.

    Args:
        questions: User questions; each becomes one single-turn chat prompt
            prefixed with the image placeholder.
        modality: Input modality; only ``"image"`` is accepted.

    Returns:
        The engine arguments, the chat-templated prompts, and the stop token
        ids that the tokenizer could resolve.
    """
    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    assert modality == "image"
    placeholder = "<image>"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # One single-turn conversation per question.
    conversations = []
    for question in questions:
        user_turn = {"role": "user", "content": f"{placeholder}\n{question}"}
        conversations.append([user_turn])
    prompts = tokenizer.apply_chat_template(
        conversations, tokenize=False, add_generation_prompt=True
    )

    # NOTE(review): this stop-token list mirrors the InternVL example in this
    # file; confirm the correct stop words against the Nemotron model card.
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = []
    for token in stop_tokens:
        token_id = tokenizer.convert_tokens_to_ids(token)
        # Tokens the tokenizer does not know resolve to None and are dropped.
        if token_id is not None:
            stop_token_ids.append(token_id)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
# Keye-VL
def
run_keye_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
model_name
=
"Kwai-Keye/Keye-VL-8B-Preview"
...
...
@@ -1186,6 +1224,7 @@ model_example_map = {
"h2ovl_chat"
:
run_h2ovl
,
"idefics3"
:
run_idefics3
,
"internvl_chat"
:
run_internvl
,
"nemotron_vl"
:
run_nemotron_vl
,
"keye_vl"
:
run_keye_vl
,
"kimi_vl"
:
run_kimi_vl
,
"llava"
:
run_llava
,
...
...
examples/offline_inference/vision_language_
embedd
ing.py
→
examples/offline_inference/vision_language_
pool
ing.py
View file @
711aa9d5
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal
embedd
ing.
the correct prompt format on vision language models for multimodal
pool
ing.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
...
...
@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from
PIL.Image
import
Image
from
vllm
import
LLM
,
EngineArgs
from
vllm.entrypoints.score_utils
import
ScoreMultiModalParam
from
vllm.multimodal.utils
import
fetch_image
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -35,14 +36,22 @@ class TextImageQuery(TypedDict):
image
:
Image
QueryModality
=
Literal
[
"text"
,
"image"
,
"text+image"
]
Query
=
Union
[
TextQuery
,
ImageQuery
,
TextImageQuery
]
class TextImagesQuery(TypedDict):
    """Query that pairs one text string with a multi-image payload."""

    modality: Literal["text+images"]
    text: str
    # Despite the singular name, this carries the multi-image score payload.
    image: ScoreMultiModalParam
QueryModality
=
Literal
[
"text"
,
"image"
,
"text+image"
,
"text+images"
]
Query
=
Union
[
TextQuery
,
ImageQuery
,
TextImageQuery
,
TextImagesQuery
]
class
ModelRequestData
(
NamedTuple
):
engine_args
:
EngineArgs
prompt
:
str
image
:
Optional
[
Image
]
prompt
:
Optional
[
str
]
=
None
image
:
Optional
[
Image
]
=
None
query
:
Optional
[
str
]
=
None
documents
:
Optional
[
ScoreMultiModalParam
]
=
None
def
run_e5_v
(
query
:
Query
)
->
ModelRequestData
:
...
...
@@ -107,6 +116,29 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
)
def run_jinavl_reranker(query: Query) -> ModelRequestData:
    """Build the scoring request for the jinaai/jina-reranker-m0 model.

    Args:
        query: Query to score; its modality must be ``"text+images"``.

    Returns:
        Request data whose ``query`` is the query text and whose
        ``documents`` is the query's image payload.

    Raises:
        ValueError: If the query has any other modality.
    """
    modality = query["modality"]
    if modality != "text+images":
        raise ValueError(f"Unsupported query modality: '{modality}'")

    # Pixel bounds forwarded to the multimodal processor.
    processor_kwargs = {
        "min_pixels": 3136,
        "max_pixels": 602112,
    }
    engine_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        task="score",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        query=query["text"],
        documents=query["image"],
    )
def
get_query
(
modality
:
QueryModality
):
if
modality
==
"text"
:
return
TextQuery
(
modality
=
"text"
,
text
=
"A dog sitting in the grass"
)
...
...
@@ -128,6 +160,28 @@ def get_query(modality: QueryModality):
),
)
if
modality
==
"text+images"
:
return
TextImagesQuery
(
modality
=
"text+images"
,
text
=
"slm markdown"
,
image
=
{
"content"
:
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
]
},
)
msg
=
f
"Modality
{
modality
}
is not supported."
raise
ValueError
(
msg
)
...
...
@@ -162,16 +216,31 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
print
(
"-"
*
50
)
def run_score(model: str, modality: QueryModality, seed: Optional[int]):
    """Score the example query against its documents and print the scores.

    Args:
        model: Key into ``model_example_map`` selecting the request builder.
        modality: Modality of the example query to build.
        seed: Seed merged into the engine arguments.
    """
    query = get_query(modality)
    build_request = model_example_map[model]
    req_data = build_request(query)

    # Merge the seed into the builder's engine arguments.
    engine_kwargs = {**asdict(req_data.engine_args), "seed": seed}
    llm = LLM(**engine_kwargs)

    results = llm.score(req_data.query, req_data.documents)

    rule = "-" * 30
    print(rule)
    scores = [result.outputs.score for result in results]
    print(scores)
    print(rule)
# Maps each supported model key to the function that builds its request data.
model_example_map = {
    "e5_v": run_e5_v,
    "vlm2vec": run_vlm2vec,
    "jinavl_reranker": run_jinavl_reranker,
}
def
parse_args
():
parser
=
FlexibleArgumentParser
(
description
=
"Demo on using vLLM for offline inference with "
"vision language models for multimodal
embedding
"
"vision language models for multimodal
pooling tasks.
"
)
parser
.
add_argument
(
"--model-name"
,
...
...
@@ -181,6 +250,14 @@ def parse_args():
choices
=
model_example_map
.
keys
(),
help
=
"The name of the embedding model."
,
)
parser
.
add_argument
(
"--task"
,
"-t"
,
type
=
str
,
default
=
"embedding"
,
choices
=
[
"embedding"
,
"scoring"
],
help
=
"The task type."
,
)
parser
.
add_argument
(
"--modality"
,
type
=
str
,
...
...
@@ -198,7 +275,12 @@ def parse_args():
def
main
(
args
:
Namespace
):
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
if
args
.
task
==
"embedding"
:
run_encode
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
elif
args
.
task
==
"scoring"
:
run_score
(
args
.
model_name
,
args
.
modality
,
args
.
seed
)
else
:
raise
ValueError
(
f
"Unsupported task:
{
args
.
task
}
"
)
if
__name__
==
"__main__"
:
...
...
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
View file @
711aa9d5
...
...
@@ -93,6 +93,7 @@ ensure_python_library_installed() {
cleanup
()
{
echo
"Stopping everything…"
trap
- INT TERM
# prevent re-entrancy
pkill
-9
-f
"disagg_proxy_p2p_nccl_xpyd.py"
kill
--
-
$$
# negative PID == "this whole process-group"
wait
# reap children so we don't leave zombies
exit
0
...
...
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
View file @
711aa9d5
...
...
@@ -4,7 +4,9 @@
import
os
import
socket
import
threading
import
time
import
uuid
from
typing
import
Any
import
aiohttp
import
msgpack
...
...
@@ -12,12 +14,25 @@ import zmq
from
quart
import
Quart
,
make_response
,
request
count
=
0
prefill_instances
:
dict
[
str
,
str
]
=
{}
# http_address: zmq_address
decode_instances
:
dict
[
str
,
str
]
=
{}
# http_address: zmq_address
prefill_instances
:
dict
[
str
,
Any
]
=
{}
# http_address:
(
zmq_address
, stamp)
decode_instances
:
dict
[
str
,
Any
]
=
{}
# http_address:
(
zmq_address
, stamp)
prefill_cv
=
threading
.
Condition
()
decode_cv
=
threading
.
Condition
()
DEFAULT_PING_SECONDS
=
5
def
_remove_oldest_instances
(
instances
:
dict
[
str
,
Any
])
->
None
:
oldest_key
=
next
(
iter
(
instances
),
None
)
while
oldest_key
is
not
None
:
value
=
instances
[
oldest_key
]
if
value
[
1
]
>
time
.
time
():
break
print
(
f
"🔴Remove [HTTP:
{
oldest_key
}
, ZMQ:
{
value
[
0
]
}
, stamp:
{
value
[
1
]
}
]"
)
instances
.
pop
(
oldest_key
,
None
)
oldest_key
=
next
(
iter
(
instances
),
None
)
def
_listen_for_register
(
poller
,
router_socket
):
while
True
:
...
...
@@ -31,12 +46,23 @@ def _listen_for_register(poller, router_socket):
global
prefill_instances
global
prefill_cv
with
prefill_cv
:
prefill_instances
[
data
[
"http_address"
]]
=
data
[
"zmq_address"
]
node
=
prefill_instances
.
pop
(
data
[
"http_address"
],
None
)
prefill_instances
[
data
[
"http_address"
]]
=
(
data
[
"zmq_address"
],
time
.
time
()
+
DEFAULT_PING_SECONDS
,
)
_remove_oldest_instances
(
prefill_instances
)
elif
data
[
"type"
]
==
"D"
:
global
decode_instances
global
decode_cv
with
decode_cv
:
decode_instances
[
data
[
"http_address"
]]
=
data
[
"zmq_address"
]
node
=
decode_instances
.
pop
(
data
[
"http_address"
],
None
)
decode_instances
[
data
[
"http_address"
]]
=
(
data
[
"zmq_address"
],
time
.
time
()
+
DEFAULT_PING_SECONDS
,
)
_remove_oldest_instances
(
decode_instances
)
else
:
print
(
"Unexpected, Received message from %s, data: %s"
,
...
...
@@ -44,6 +70,9 @@ def _listen_for_register(poller, router_socket):
data
,
)
if
node
is
None
:
print
(
f
"🔵Add [HTTP:
{
data
[
'http_address'
]
}
, ZMQ:
{
data
[
'zmq_address'
]
}
]"
)
def
start_service_discovery
(
hostname
,
port
):
if
not
hostname
:
...
...
@@ -105,12 +134,14 @@ async def handle_request():
with
prefill_cv
:
prefill_list
=
list
(
prefill_instances
.
items
())
prefill_addr
,
prefill_zmq_addr
=
prefill_list
[
count
%
len
(
prefill_list
)]
prefill_zmq_addr
=
prefill_zmq_addr
[
0
]
global
decode_instances
global
decode_cv
with
decode_cv
:
decode_list
=
list
(
decode_instances
.
items
())
decode_addr
,
decode_zmq_addr
=
decode_list
[
count
%
len
(
decode_list
)]
decode_zmq_addr
=
decode_zmq_addr
[
0
]
print
(
f
"handle_request count:
{
count
}
, [HTTP:
{
prefill_addr
}
, "
...
...
examples/online_serving/elastic_ep/bench.sh
0 → 100644
View file @
711aa9d5
#!/bin/bash
# Benchmark a running vLLM server with `vllm bench serve`.
#
# Every setting below can be overridden on the command line; run with
# --help for the list of options.

MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
HOST="localhost"
PORT=8006
NUM_PROMPTS=20
REQUEST_RATE=5

# Abort unless the option in $1 is followed by a value.
# Without this check, `shift 2` fails when the value is missing, leaves the
# positional parameters unchanged, and the parsing loop never terminates.
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Missing value for option: $1" >&2
        echo "Use -h or --help for usage information" >&2
        exit 1
    fi
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            require_value "$@"
            MODEL_NAME="$2"
            shift 2
            ;;
        --local-model)
            MODEL_NAME=$LOCAL_MODEL_PATH
            shift
            ;;
        --host)
            require_value "$@"
            HOST="$2"
            shift 2
            ;;
        --port)
            require_value "$@"
            PORT="$2"
            shift 2
            ;;
        --num-prompts)
            require_value "$@"
            NUM_PROMPTS="$2"
            shift 2
            ;;
        --request-rate)
            require_value "$@"
            REQUEST_RATE="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --model MODEL_NAME    Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
            echo "  --local-model         Use local model path (convenience option)"
            echo "  --host HOST           Server host (default: localhost)"
            echo "  --port PORT           Server port (default: 8006)"
            echo "  --num-prompts N       Number of prompts to send (default: 20)"
            echo "  --request-rate RATE   Requests per second (default: 5)"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use -h or --help for usage information"
            exit 1
            ;;
    esac
done

# Quote every expansion so values containing spaces stay single arguments.
vllm bench serve \
    --model "$MODEL_NAME" \
    --host "$HOST" \
    --port "$PORT" \
    --num-prompts "$NUM_PROMPTS" \
    --request-rate "$REQUEST_RATE"
examples/online_serving/elastic_ep/scale.py
0 → 100644
View file @
711aa9d5
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
json
import
sys
import
requests
def scale(host, port, new_dp_size):
    """Ask the API server to change its data parallel size.

    Sends a POST request to ``/scale_elastic_ep`` on the given host and
    port, and prints the request and the response.

    Args:
        host: API server host.
        port: API server port.
        new_dp_size: Value sent as ``new_data_parallel_size``.

    Returns:
        True if the server answered with HTTP 200; False for any other
        status code or if the request raised a ``RequestException``.
    """
    endpoint = f"http://{host}:{port}/scale_elastic_ep"
    body = {"new_data_parallel_size": new_dp_size}
    request_headers = {"Content-Type": "application/json"}

    print(f"Sending scale request to {endpoint}")
    print(f"Payload: {json.dumps(body, indent=2)}")

    try:
        response = requests.post(
            endpoint, json=body, headers=request_headers, timeout=300
        )

        print(f"Status Code: {response.status_code}")
        print(f"Response: {response.text}")

        succeeded = response.status_code == 200
        print(
            "Scale up/down request successful!"
            if succeeded
            else "Scale up/down request failed!"
        )
        return succeeded
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return False
def main():
    """Parse the command line, send the scale request, and exit.

    The process exits with status 0 when the scale request succeeded and
    with status 1 otherwise.
    """
    arg_parser = argparse.ArgumentParser(
        description="Test scale up/down functionality"
    )
    arg_parser.add_argument(
        "--host", default="localhost", help="API server host"
    )
    arg_parser.add_argument(
        "--port", type=int, default=8006, help="API server port"
    )
    arg_parser.add_argument(
        "--new-dp-size", type=int, default=2, help="New data parallel size"
    )
    options = arg_parser.parse_args()

    succeeded = scale(options.host, options.port, options.new_dp_size)
    exit_code = 0 if succeeded else 1
    sys.exit(exit_code)
if
__name__
==
"__main__"
:
main
()
Prev
1
…
7
8
9
10
11
12
13
14
15
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment