Commit 711aa9d5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
...@@ -31,10 +31,10 @@ def main(args: Namespace): ...@@ -31,10 +31,10 @@ def main(args: Namespace):
# Create an LLM. # Create an LLM.
# You should pass task="embed" for embedding models # You should pass task="embed" for embedding models
model = LLM(**vars(args)) llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs. # Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts) outputs = llm.embed(prompts)
# Print the outputs. # Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
......
...@@ -27,10 +27,10 @@ def main(args: Namespace): ...@@ -27,10 +27,10 @@ def main(args: Namespace):
# Create an LLM. # Create an LLM.
# You should pass task="score" for cross-encoder models # You should pass task="score" for cross-encoder models
model = LLM(**vars(args)) llm = LLM(**vars(args))
# Generate scores. The output is a list of ScoringRequestOutputs. # Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2) outputs = llm.score(text_1, texts_2)
# Print the outputs. # Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
......
...@@ -3,17 +3,19 @@ ...@@ -3,17 +3,19 @@
""" """
This example shows how to use Ray Data for data parallel batch inference. This example shows how to use Ray Data for data parallel batch inference.
Ray Data is a data processing framework that can handle large datasets Ray Data is a data processing framework that can process very large datasets
and integrates tightly with vLLM for data-parallel inference. with first-class support for vLLM.
As of Ray 2.44, Ray Data has a native integration with
vLLM (under ray.data.llm).
Ray Data provides functionality for: Ray Data provides functionality for:
* Reading and writing to cloud storage (S3, GCS, etc.) * Reading and writing to most popular file formats and cloud object storage.
* Automatic sharding and load-balancing across a cluster * Streaming execution, so you can run inference on datasets that far exceed
* Optimized configuration of vLLM using continuous batching the aggregate RAM of the cluster.
* Compatible with tensor/pipeline parallel inference as well. * Scale up the workload without code changes.
* Automatic sharding, load-balancing, and autoscaling across a Ray cluster,
with built-in fault-tolerance and retry semantics.
* Continuous batching that keeps vLLM replicas saturated and maximizes GPU
utilization.
* Compatible with tensor/pipeline parallel inference.
Learn more about Ray Data's LLM integration: Learn more about Ray Data's LLM integration:
https://docs.ray.io/en/latest/data/working-with-llms.html https://docs.ray.io/en/latest/data/working-with-llms.html
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import argparse
import json
import torch
import transformers
# Usage:
# for BAAI/bge-reranker-v2-gemma
# Caution: "Yes" and "yes" are two different tokens
# python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
# for mxbai-rerank-v2
# python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
# for Qwen3-Reranker
# python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
    """Initialize a single-logit classifier head from two LM-head rows.

    The score weight is set to the LM-head row of ``tokens[1]`` (the "true"
    token) minus the row of ``tokens[0]`` (the "false" token), computed in
    float32. The score bias, when present, is zeroed. The update is applied
    to ``seq_cls_model`` in place.

    Args:
        causal_lm: Model exposing ``lm_head.weight``.
        seq_cls_model: Model exposing ``score.weight`` and ``score.bias``.
        tokenizer: Object providing ``convert_tokens_to_ids``.
        tokens: Exactly two tokens, ordered ``[false_token, true_token]``.
        device: Device the selected rows are moved to before subtraction.

    Raises:
        AssertionError: If ``tokens`` does not contain exactly two entries.
        ValueError: If a token does not resolve to a vocabulary id.
    """
    # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
    assert len(tokens) == 2

    false_id = tokenizer.convert_tokens_to_ids(tokens[0])
    true_id = tokenizer.convert_tokens_to_ids(tokens[1])

    # A token missing from the vocabulary comes back as None or as the
    # unknown-token id; using it would silently build a meaningless head
    # (e.g. "Yes" and "yes" are different tokens).
    unk_id = getattr(tokenizer, "unk_token_id", None)
    unk_token = getattr(tokenizer, "unk_token", None)
    for token, token_id in zip(tokens, (false_id, true_id)):
        if token_id is None or (token_id == unk_id and token != unk_token):
            raise ValueError(
                f"Token {token!r} is not in the tokenizer vocabulary; "
                "cannot build the classifier from it."
            )

    lm_head_weights = causal_lm.lm_head.weight

    # Pure weight surgery: keep the whole computation out of autograd.
    with torch.no_grad():
        true_row = lm_head_weights[true_id].to(device).to(torch.float32)
        false_row = lm_head_weights[false_id].to(device).to(torch.float32)
        score_weight = true_row - false_row

        seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0))
        if seq_cls_model.score.bias is not None:
            seq_cls_model.score.bias.zero_()
def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
    """Initialize the classifier head by copying LM-head rows verbatim.

    One LM-head row is selected per entry of ``tokens`` (in order) and the
    stacked rows are copied into ``seq_cls_model.score.weight``. The score
    bias, when present, is zeroed. The update is applied in place.

    Args:
        causal_lm: Model exposing ``lm_head.weight``.
        seq_cls_model: Model exposing ``score.weight`` and ``score.bias``.
        tokenizer: Object providing ``convert_tokens_to_ids``.
        tokens: Tokens whose LM-head rows become the classifier rows.
        device: Device the selected rows are moved to before copying.

    Raises:
        ValueError: If a token does not resolve to a vocabulary id.
    """
    token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]

    # A token missing from the vocabulary comes back as None or as the
    # unknown-token id; using it would silently build a meaningless head
    # (e.g. "Yes" and "yes" are different tokens).
    unk_id = getattr(tokenizer, "unk_token_id", None)
    unk_token = getattr(tokenizer, "unk_token", None)
    for token, token_id in zip(tokens, token_ids):
        if token_id is None or (token_id == unk_id and token != unk_token):
            raise ValueError(
                f"Token {token!r} is not in the tokenizer vocabulary; "
                "cannot build the classifier from it."
            )

    lm_head_weights = causal_lm.lm_head.weight

    # Pure weight surgery: keep the whole computation out of autograd.
    with torch.no_grad():
        score_weight = lm_head_weights[token_ids].to(device)
        seq_cls_model.score.weight.copy_(score_weight)
        if seq_cls_model.score.bias is not None:
            seq_cls_model.score.bias.zero_()
# Registry of conversion strategies, keyed by the function's own name so the
# `--method` command-line value can be used for lookup directly.
method_map = {
    "from_2_way_softmax": from_2_way_softmax,
    "no_post_processing": no_post_processing,
}
def converting(
    model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu"
):
    """Convert a *ForCausalLM checkpoint to *ForSequenceClassification.

    Loads the tokenizer, the causal LM and a sequence-classification model
    from ``model_name``, fills the classifier head using the strategy
    registered under ``method`` in ``method_map``, then saves the converted
    model and the tokenizer to ``path``.

    Args:
        model_name: Name or path passed to the ``from_pretrained`` loaders.
        classifier_from_tokens: Tokens the classifier head is built from.
        path: Output directory for the converted model and tokenizer.
        method: Key into ``method_map`` selecting the conversion strategy.
        use_pad_token: Value stored in the saved config as ``use_pad_token``.
        device: Passed as ``device_map`` to the model loaders and on to the
            conversion strategy.
    """
    assert method in method_map
    convert = method_map[method]

    # The 2-way softmax strategy collapses two tokens into a single logit;
    # every other strategy keeps one label per token.
    if method != "from_2_way_softmax":
        num_labels = len(classifier_from_tokens)
    else:
        assert len(classifier_from_tokens) == 2
        num_labels = 1

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    causal_lm = transformers.AutoModelForCausalLM.from_pretrained(
        model_name, device_map=device
    )
    seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
        device_map=device,
    )

    convert(causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device)

    # `llm as reranker` defaults to not using pad_token
    config = seq_cls_model.config
    config.use_pad_token = use_pad_token
    config.pad_token_id = tokenizer.pad_token_id

    seq_cls_model.save_pretrained(path)
    tokenizer.save_pretrained(path)
def parse_args():
    """Parse the command-line options of the conversion script.

    Returns:
        argparse.Namespace with ``model_name``, ``classifier_from_tokens``
        (still a JSON-encoded string), ``method``, ``use_pad_token`` and
        ``path``.
    """
    parser = argparse.ArgumentParser(
        description="Converting *ForCausalLM models to "
        "*ForSequenceClassification models."
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="BAAI/bge-reranker-v2-gemma",
        help="Model name",
    )
    parser.add_argument(
        "--classifier_from_tokens",
        type=str,
        default='["Yes"]',
        help="classifier from tokens",
    )
    parser.add_argument(
        "--method",
        type=str,
        default="no_post_processing",
        help="Conversion method: from_2_way_softmax or no_post_processing",
    )
    parser.add_argument(
        "--use-pad-token", action="store_true", help="Whether to use pad_token"
    )
    parser.add_argument(
        "--path",
        type=str,
        default="./bge-reranker-v2-gemma-seq-cls",
        help="Path to save converted model",
    )
    return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: the token list arrives as a JSON string on the
    # command line and is decoded before being handed to the converter.
    cli_args = parse_args()
    classifier_tokens = json.loads(cli_args.classifier_from_tokens)
    converting(
        model_name=cli_args.model_name,
        classifier_from_tokens=classifier_tokens,
        method=cli_args.method,
        use_pad_token=cli_args.use_pad_token,
        path=cli_args.path,
    )
...@@ -30,11 +30,11 @@ def main(args: Namespace): ...@@ -30,11 +30,11 @@ def main(args: Namespace):
# Create an LLM. # Create an LLM.
# You should pass task="embed" for embedding models # You should pass task="embed" for embedding models
model = LLM(**vars(args)) llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs. # Generate embedding. The output is a list of EmbeddingRequestOutputs.
# Only text matching task is supported for now. See #16120 # Only text matching task is supported for now. See #16120
outputs = model.embed(prompts) outputs = llm.embed(prompts)
# Print the outputs. # Print the outputs.
print("\nGenerated Outputs:") print("\nGenerated Outputs:")
......
...@@ -30,10 +30,10 @@ def main(args: Namespace): ...@@ -30,10 +30,10 @@ def main(args: Namespace):
# Create an LLM. # Create an LLM.
# You should pass task="embed" for embedding models # You should pass task="embed" for embedding models
model = LLM(**vars(args)) llm = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs. # Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32)) outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))
# Print the outputs. # Print the outputs.
print("\nGenerated Outputs:") print("\nGenerated Outputs:")
......
...@@ -54,7 +54,7 @@ def main(): ...@@ -54,7 +54,7 @@ def main():
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}")
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -25,7 +25,7 @@ def config_buckets(): ...@@ -25,7 +25,7 @@ def config_buckets():
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
def initialize_model(): def initialize_llm():
"""Create an LLM with speculative decoding.""" """Create an LLM with speculative decoding."""
return LLM( return LLM(
model="openlm-research/open_llama_7b", model="openlm-research/open_llama_7b",
...@@ -37,15 +37,14 @@ def initialize_model(): ...@@ -37,15 +37,14 @@ def initialize_model():
max_num_seqs=4, max_num_seqs=4,
max_model_len=2048, max_model_len=2048,
block_size=2048, block_size=2048,
use_v2_block_manager=True,
device="neuron", device="neuron",
tensor_parallel_size=32, tensor_parallel_size=32,
) )
def process_requests(model: LLM, sampling_params: SamplingParams): def process_requests(llm: LLM, sampling_params: SamplingParams):
"""Generate texts from prompts and print them.""" """Generate texts from prompts and print them."""
outputs = model.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
...@@ -53,12 +52,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams): ...@@ -53,12 +52,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams):
def main(): def main():
"""Main function that sets up the model and processes prompts.""" """Main function that sets up the llm and processes prompts."""
config_buckets() config_buckets()
model = initialize_model() llm = initialize_llm()
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, top_k=1) sampling_params = SamplingParams(max_tokens=100, top_k=1)
process_requests(model, sampling_params) process_requests(llm, sampling_params)
if __name__ == "__main__": if __name__ == "__main__":
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This is a demo script showing how to use the
PrithviGeospatialMAE model with vLLM
This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa
Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa
The requirements for running this script are:
- Installing [terratorch, albumentations, rasterio] in your python environment
- downloading the model weights in a 'model' folder local to the script
(temporary measure until the proper config.json file is uploaded to HF)
- download an input example image (India_900498_S2Hand.tif) and place it in
the same folder with the script (or specify with the --data_file argument)
Run the example:
python prithvi_geospatial_mae.py
""" # noqa: E501
import argparse import argparse
import datetime import datetime
import os import os
import re
from typing import Union from typing import Union
import albumentations import albumentations
import numpy as np import numpy as np
import rasterio import rasterio
import regex as re
import torch import torch
from einops import rearrange from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule from terratorch.datamodules import Sen1Floods11NonGeoDataModule
from vllm import LLM from vllm import LLM
torch.set_default_dtype(torch.float16)
NO_DATA = -9999 NO_DATA = -9999
NO_DATA_FLOAT = 0.0001 NO_DATA_FLOAT = 0.0001
OFFSET = 0 OFFSET = 0
PERCENTILE = 99 PERCENTILE = 99
model_config = """{
"architectures": ["PrithviGeoSpatialMAE"],
"num_classes": 0,
"pretrained_cfg": {
"task_args": {
"task": "SemanticSegmentationTask",
"model_factory": "EncoderDecoderFactory",
"loss": "ce",
"ignore_index": -1,
"lr": 0.001,
"freeze_backbone": false,
"freeze_decoder": false,
"plot_on_val": 10,
"optimizer": "AdamW",
"scheduler": "CosineAnnealingLR"
},
"model_args": {
"backbone_pretrained": false,
"backbone": "prithvi_eo_v2_300_tl",
"decoder": "UperNetDecoder",
"decoder_channels": 256,
"decoder_scale_modules": true,
"num_classes": 2,
"rescale": true,
"backbone_bands": [
"BLUE",
"GREEN",
"RED",
"NIR_NARROW",
"SWIR_1",
"SWIR_2"
],
"head_dropout": 0.1,
"necks": [
{
"name": "SelectIndices",
"indices": [
5,
11,
17,
23
]
},
{
"name": "ReshapeTokensToImage"
}
]
},
"optimizer_params" : {
"lr": 5.0e-05,
"betas": [0.9, 0.999],
"eps": [1.0e-08],
"weight_decay": 0.05,
"amsgrad": false,
"maximize": false,
"capturable": false,
"differentiable": false
},
"scheduler_params" : {
"T_max": 50,
"eta_min": 0,
"last_epoch": -1,
"verbose": "deprecated"
}
},
"torch_dtype": "float32"
}
"""
# Temporarily creating the "config.json" for the model.
# This is going to disappear once the correct config.json is available on HF
with open(
os.path.join(os.path.dirname(__file__), "./model/config.json"), "w"
) as config_file:
config_file.write(model_config)
datamodule_config = { datamodule_config = {
"bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"], "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
"batch_size": 16, "batch_size": 16,
...@@ -138,28 +43,24 @@ datamodule_config = { ...@@ -138,28 +43,24 @@ datamodule_config = {
class PrithviMAE: class PrithviMAE:
def __init__(self): def __init__(self, model):
print("Initializing PrithviMAE model")
self.model = LLM( self.model = LLM(
model=os.path.join(os.path.dirname(__file__), "./model"), model=model, skip_tokenizer_init=True, dtype="float16", enforce_eager=True
skip_tokenizer_init=True,
dtype="float32",
) )
def run(self, input_data, location_coords): def run(self, input_data, location_coords):
print("################ Running inference on vLLM ##############")
# merge the inputs into one data structure # merge the inputs into one data structure
if input_data is not None and input_data.dtype == torch.float32:
input_data = input_data.to(torch.float16)
input_data = input_data[0]
mm_data = { mm_data = {
"pixel_values": torch.empty(0) if input_data is None else input_data, "pixel_values": input_data,
"location_coords": torch.empty(0) "location_coords": location_coords,
if location_coords is None
else location_coords,
} }
prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
outputs = self.model.encode(prompt, use_tqdm=False) outputs = self.model.encode(prompt, use_tqdm=False)
print("################ Inference done (it took seconds) ##############")
return outputs[0].outputs.data return outputs[0].outputs.data
...@@ -181,11 +82,12 @@ def process_channel_group(orig_img, channels): ...@@ -181,11 +82,12 @@ def process_channel_group(orig_img, channels):
""" """
Args: Args:
orig_img: torch.Tensor representing original image (reference) orig_img: torch.Tensor representing original image (reference)
with shape = (bands, H, W). with shape = (bands, H, W).
channels: list of indices representing RGB channels. channels: list of indices representing RGB channels.
Returns: Returns:
torch.Tensor with shape (num_channels, height, width) for original image torch.Tensor with shape (num_channels, height, width)
for original image
""" """
orig_img = orig_img[channels, ...] orig_img = orig_img[channels, ...]
...@@ -260,10 +162,10 @@ def load_example( ...@@ -260,10 +162,10 @@ def load_example(
Args: Args:
file_paths: list of file paths . file_paths: list of file paths .
mean: list containing mean values for each band in the images mean: list containing mean values for each band in the
in *file_paths*. images in *file_paths*.
std: list containing std values for each band in the images std: list containing std values for each band in the
in *file_paths*. images in *file_paths*.
Returns: Returns:
np.array containing created example np.array containing created example
...@@ -308,7 +210,7 @@ def load_example( ...@@ -308,7 +210,7 @@ def load_example(
print(f"Could not extract timestamp for {file} ({e})") print(f"Could not extract timestamp for {file} ({e})")
imgs = np.stack(imgs, axis=0) # num_frames, H, W, C imgs = np.stack(imgs, axis=0) # num_frames, H, W, C
imgs = np.moveaxis(imgs, -1, 0).astype("float32") imgs = np.moveaxis(imgs, -1, 0).astype("float32") # C, num_frames, H, W
imgs = np.expand_dims(imgs, axis=0) # add batch di imgs = np.expand_dims(imgs, axis=0) # add batch di
return imgs, temporal_coords, location_coords, metas return imgs, temporal_coords, location_coords, metas
...@@ -332,8 +234,10 @@ def run_model( ...@@ -332,8 +234,10 @@ def run_model(
) )
# Build sliding window # Build sliding window
batch_size = 1 batch_size = 1
batch = torch.tensor(input_data, device="cpu") # batch = torch.tensor(input_data, device="cpu")
batch = torch.tensor(input_data)
windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size) windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
h1, w1 = windows.shape[3:5] h1, w1 = windows.shape[3:5]
windows = rearrange( windows = rearrange(
...@@ -344,18 +248,16 @@ def run_model( ...@@ -344,18 +248,16 @@ def run_model(
num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1 num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
windows = torch.tensor_split(windows, num_batches, dim=0) windows = torch.tensor_split(windows, num_batches, dim=0)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if temporal_coords: if temporal_coords:
temporal_coords = torch.tensor(temporal_coords, device=device).unsqueeze(0) temporal_coords = torch.tensor(temporal_coords).unsqueeze(0)
else: else:
temporal_coords = None temporal_coords = None
if location_coords: if location_coords:
location_coords = torch.tensor(location_coords[0], device=device).unsqueeze(0) location_coords = torch.tensor(location_coords[0]).unsqueeze(0)
else: else:
location_coords = None location_coords = None
# Run model # Run Prithvi-EO-V2-300M-TL-Sen1Floods11
pred_imgs = [] pred_imgs = []
for x in windows: for x in windows:
# Apply standardization # Apply standardization
...@@ -363,15 +265,7 @@ def run_model( ...@@ -363,15 +265,7 @@ def run_model(
x = datamodule.aug(x)["image"] x = datamodule.aug(x)["image"]
with torch.no_grad(): with torch.no_grad():
x = x.to(device)
pred = model.run(x, location_coords=location_coords) pred = model.run(x, location_coords=location_coords)
if lightning_model:
pred_lightning = lightning_model(
x, temporal_coords=temporal_coords, location_coords=location_coords
)
pred_lightning = pred_lightning.output.detach().cpu()
if not torch.equal(pred, pred_lightning):
print("Inference output is not equal")
y_hat = pred.argmax(dim=1) y_hat = pred.argmax(dim=1)
y_hat = torch.nn.functional.interpolate( y_hat = torch.nn.functional.interpolate(
...@@ -403,52 +297,18 @@ def run_model( ...@@ -403,52 +297,18 @@ def run_model(
return pred_imgs return pred_imgs
def parse_args():
parser = argparse.ArgumentParser("MAE run inference", add_help=False)
parser.add_argument(
"--data_file",
type=str,
default="./India_900498_S2Hand.tif",
help="Path to the file.",
)
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="Path to the directory where to save outputs.",
)
parser.add_argument(
"--input_indices",
default=[1, 2, 3, 8, 11, 12],
type=int,
nargs="+",
help="0-based indices of the six Prithvi channels to be selected from the "
"input. By default selects [1,2,3,8,11,12] for S2L1C data.",
)
parser.add_argument(
"--rgb_outputs",
action="store_true",
help="If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved.",
)
def main( def main(
data_file: str, data_file: str,
model: str,
output_dir: str, output_dir: str,
rgb_outputs: bool, rgb_outputs: bool,
input_indices: list[int] = None, input_indices: list[int] = None,
): ):
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
# Load model --------------------------------------------------------------- model_obj = PrithviMAE(model=model)
model_obj = PrithviMAE()
datamodule = generate_datamodule() datamodule = generate_datamodule()
img_size = 256 # Size of Sen1Floods11 img_size = 512 # Size of Sen1Floods11
# Loading data -------------------------------------------------------------
input_data, temporal_coords, location_coords, meta_data = load_example( input_data, temporal_coords, location_coords, meta_data = load_example(
file_paths=[data_file], file_paths=[data_file],
...@@ -460,8 +320,6 @@ def main( ...@@ -460,8 +320,6 @@ def main(
if input_data.mean() > 1: if input_data.mean() > 1:
input_data = input_data / 10000 # Convert to range 0-1 input_data = input_data / 10000 # Convert to range 0-1
# Running model ------------------------------------------------------------
channels = [ channels = [
datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"] datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"]
] # BGR -> RGB ] # BGR -> RGB
...@@ -469,7 +327,6 @@ def main( ...@@ -469,7 +327,6 @@ def main(
pred = run_model( pred = run_model(
input_data, temporal_coords, location_coords, model_obj, datamodule, img_size input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
) )
# Save pred # Save pred
meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0) meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
pred_file = os.path.join( pred_file = os.path.join(
...@@ -487,6 +344,7 @@ def main( ...@@ -487,6 +344,7 @@ def main(
orig_img=torch.Tensor(input_data[0, :, 0, ...]), orig_img=torch.Tensor(input_data[0, :, 0, ...]),
channels=channels, channels=channels,
) )
rgb_orig = rgb_orig.to(torch.float32)
pred[pred == 0.0] = np.nan pred[pred == 0.0] = np.nan
img_pred = rgb_orig * 0.7 + pred * 0.3 img_pred = rgb_orig * 0.7 + pred * 0.3
...@@ -503,9 +361,10 @@ def main( ...@@ -503,9 +361,10 @@ def main(
# Save image rgb # Save image rgb
if rgb_outputs: if rgb_outputs:
name_suffix = os.path.splitext(os.path.basename(data_file))[0]
rgb_file = os.path.join( rgb_file = os.path.join(
output_dir, output_dir,
f"original_rgb_{os.path.splitext(os.path.basename(data_file))[0]}.tiff", f"original_rgb_{name_suffix}.tiff",
) )
save_geotiff( save_geotiff(
image=_convert_np_uint8(rgb_orig), image=_convert_np_uint8(rgb_orig),
...@@ -515,6 +374,42 @@ def main( ...@@ -515,6 +374,42 @@ def main(
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() parser = argparse.ArgumentParser("MAE run inference", add_help=False)
parser.add_argument(
"--data_file",
type=str,
default="./India_900498_S2Hand.tif",
help="Path to the file.",
)
parser.add_argument(
"--model",
type=str,
default="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
help="Path to a checkpoint file to load from.",
)
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="Path to the directory where to save outputs.",
)
parser.add_argument(
"--input_indices",
default=[1, 2, 3, 8, 11, 12],
type=int,
nargs="+",
help="""
0-based indices of the six Prithvi channels to be selected from the input.
By default selects [1,2,3,8,11,12] for S2L1C data.
""",
)
parser.add_argument(
"--rgb_outputs",
action="store_true",
help="If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved.",
)
args = parser.parse_args()
main(**vars(args)) main(**vars(args))
...@@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B" ...@@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
# Models converted offline using this method can not only be more efficient # Models converted offline using this method can not only be more efficient
# and support the vllm score API, but also make the init parameters more # and support the vllm score API, but also make the init parameters more
# concise, for example. # concise, for example.
# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") # llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
# If you want to load the official original version, the init parameters are # If you want to load the official original version, the init parameters are
# as follows. # as follows.
def get_model() -> LLM: def get_llm() -> LLM:
"""Initializes and returns the LLM model for Qwen3-Reranker.""" """Initializes and returns the LLM model for Qwen3-Reranker."""
return LLM( return LLM(
model=model_name, model=model_name,
...@@ -76,8 +76,8 @@ def main() -> None: ...@@ -76,8 +76,8 @@ def main() -> None:
] ]
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
model = get_model() llm = get_llm()
outputs = model.score(queries, documents) outputs = llm.score(queries, documents)
print("-" * 30) print("-" * 30)
print([output.outputs.score for output in outputs]) print([output.outputs.score for output in outputs])
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
a simple demonstration of RLHF with vLLM, inspired by Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
It follows the design that, training processes and inference processes The script separates training and inference workloads onto distinct GPUs
are different, and they live on different GPUs. so that Ray can manage process placement and inter-process communication.
Training processes send prompts to inference processes to generate data, A Hugging Face Transformer model occupies GPU 0 for training, whereas a
and also synchronize the weights of the model by broadcasting the weights tensor-parallel vLLM inference engine occupies GPU 1–2.
from the training process to the inference process.
Note that this is a simple demonstration of one training instance and one The example performs the following steps:
inference instance. In practice, there could be multiple training instances
and multiple inference instances. For the full implementation, please refer * Load the training model on GPU 0.
to the OpenRLHF framework. * Split the inference model across GPUs 1–2 using vLLM's tensor parallelism
and Ray placement groups.
* Generate text from a list of prompts using the inference engine.
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group. Note that
for demonstration purposes we simply zero out the weights.
For a production-ready implementation that supports multiple training and
inference replicas, see the OpenRLHF framework:
https://github.com/OpenRLHF/OpenRLHF
This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects the GPUs to be used only for vLLM
workloads. Residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.
""" """
import os import os
...@@ -28,29 +42,27 @@ from vllm.utils import get_ip, get_open_port ...@@ -28,29 +42,27 @@ from vllm.utils import get_ip, get_open_port
class MyLLM(LLM): class MyLLM(LLM):
"""Configure the vLLM worker for Ray placement group execution."""
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
# a hack to make the script work. # Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
# stop ray from manipulating CUDA_VISIBLE_DEVICES # so that vLLM can manage its own device placement within the worker.
# at the top-level
os.environ.pop("CUDA_VISIBLE_DEVICES", None) os.environ.pop("CUDA_VISIBLE_DEVICES", None)
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
""" # Load the OPT-125M model onto GPU 0 for the training workload.
Start the training process, here we use huggingface transformers
as an example to hold a model on GPU 0.
"""
train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
train_model.to("cuda:0") train_model.to("cuda:0")
"""
Start the inference process, here we use vLLM to hold a model on GPU 1 and # Initialize Ray and set the visible devices. The vLLM engine will
GPU 2. For the details on how to use ray, please refer to the ray # be placed on GPUs 1 and 2.
documentation https://docs.ray.io/en/latest/ .
"""
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2" os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
ray.init() ray.init()
# Create a placement group that reserves GPU 1–2 for the vLLM inference engine.
# Learn more about Ray placement groups:
# https://docs.ray.io/en/latest/placement-groups.html
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
ray.get(pg_inference.ready()) ray.get(pg_inference.ready())
scheduling_inference = PlacementGroupSchedulingStrategy( scheduling_inference = PlacementGroupSchedulingStrategy(
...@@ -58,10 +70,9 @@ scheduling_inference = PlacementGroupSchedulingStrategy( ...@@ -58,10 +70,9 @@ scheduling_inference = PlacementGroupSchedulingStrategy(
placement_group_capture_child_tasks=True, placement_group_capture_child_tasks=True,
placement_group_bundle_index=0, placement_group_bundle_index=0,
) )
"""
launch the vLLM inference engine. # Launch the vLLM inference engine. The `enforce_eager` flag reduces
here we use `enforce_eager` to reduce the start time. # start-up latency.
"""
llm = ray.remote( llm = ray.remote(
num_cpus=0, num_cpus=0,
num_gpus=0, num_gpus=0,
...@@ -74,7 +85,7 @@ llm = ray.remote( ...@@ -74,7 +85,7 @@ llm = ray.remote(
distributed_executor_backend="ray", distributed_executor_backend="ray",
) )
# Generate texts from the prompts. # Generate text from the prompts.
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
...@@ -93,8 +104,8 @@ for output in outputs: ...@@ -93,8 +104,8 @@ for output in outputs:
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50) print("-" * 50)
# set up the communication between the training process # Set up the communication channel between the training process and the
# and the inference engine. # inference engine.
master_address = get_ip() master_address = get_ip()
master_port = get_open_port() master_port = get_open_port()
...@@ -107,21 +118,23 @@ model_update_group = stateless_init_process_group( ...@@ -107,21 +118,23 @@ model_update_group = stateless_init_process_group(
) )
ray.get(handle) ray.get(handle)
# simulate training, modify the weights of the model. # Simulate a training step by zeroing out all model weights.
# In a real RLHF training loop the weights would be updated using the gradient
# from an RL objective such as PPO on a reward model.
for name, p in train_model.named_parameters(): for name, p in train_model.named_parameters():
p.data.zero_() p.data.zero_()
# sync weight from the training process to the inference engine. # Synchronize the updated weights to the inference engine.
for name, p in train_model.named_parameters(): for name, p in train_model.named_parameters():
handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape)) handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape))
model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
ray.get(handle) ray.get(handle)
# check if the weights are updated. # Verify that the inference weights have been updated.
assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
# use the updated model to generate texts, they will be nonsense # Generate text with the updated model. The output is expected to be nonsense
# because the weights are all zeros. # because the weights are zero.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50) print("-" * 50)
for output in outputs_updated: for output in outputs_updated:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
a simple demonstration to show how to co-locate Demonstrates how to co-locate a vLLM inference worker and training
vLLM worker with training actors on the same GPUs, actors on the same set of GPUs for reinforcement learning from human feedback
for RLHF-like applications. (RLHF) workloads.
The key points:
- Control the placement of the vLLM workers with Ray, by setting Ray serves as the distributed execution framework in this example. Ray
VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly. placement groups allocate both training actors and vLLM workers to the
- Use cuda-ipc to pass tensors, since NCCL does not work when we have same GPU bundles, enabling fast, in-GPU communication between the two
multiple processes on the same GPU. components.
The script shows how to do the following:
* Configure environment variables (`VLLM_RAY_PER_WORKER_GPUS` and
`VLLM_RAY_BUNDLE_INDICES`) so that vLLM workers land on the desired
devices.
* Exchange tensors between processes by means of CUDA inter-process
communication (IPC). CUDA IPC sidesteps NCCL limitations that occur
when multiple processes share a single GPU.
Note that this example assumes a single-node cluster with four GPUs, but Ray
supports multi-node clusters. vLLM expects exclusive use of the GPUs during
its initialization for memory profiling. Residual GPU activity interferes
with vLLM memory profiling and causes unexpected behavior.
Learn more about Ray placement groups:
https://docs.ray.io/en/latest/placement-groups.html
""" """
import os import os
...@@ -22,13 +39,24 @@ from vllm import LLM ...@@ -22,13 +39,24 @@ from vllm import LLM
class MyLLM(LLM): class MyLLM(LLM):
def __init__(self, *args, bundle_indices: list, **kwargs): """Configure the vLLM worker for Ray placement group execution.
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES The constructor sets environment variables that allow multiple vLLM
# at the top-level workers to share a single physical GPU and that encode the bundle
indices assigned by the placement group.
Args:
*args: Positional arguments forwarded to `vllm.LLM`.
bundle_indices (list[int]): Placement-group bundle indices
assigned to this worker.
**kwargs: Keyword arguments forwarded to `vllm.LLM`.
"""
def __init__(self, *args, bundle_indices: list[int], **kwargs):
# Prevent Ray from manipulating the top-level CUDA_VISIBLE_DEVICES variable
# so that vLLM can manage its own device placement inside the worker.
os.environ.pop("CUDA_VISIBLE_DEVICES", None) os.environ.pop("CUDA_VISIBLE_DEVICES", None)
# every worker will use 0.4 GPU, so that we can schedule # Each worker uses 0.4 GPU so that two instances fit on the same GPUs.
# 2 instances on the same GPUs.
os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices)) os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
print(f"creating LLM with bundle_indices={bundle_indices}") print(f"creating LLM with bundle_indices={bundle_indices}")
...@@ -36,17 +64,25 @@ class MyLLM(LLM): ...@@ -36,17 +64,25 @@ class MyLLM(LLM):
class RayTrainingActor: class RayTrainingActor:
"""Training actor that hosts a Facebook OPT-125M model from Hugging Face.
The model is loaded onto the first GPU assigned to this actor, which exposes
the CUDA IPC handles so that colocated vLLM workers can map tensors
directly.
"""
def __init__(self): def __init__(self):
# ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs # Ray sets CUDA_VISIBLE_DEVICES to the GPUs assigned to this actor.
from transformers import AutoModelForCausalLM from transformers import AutoModelForCausalLM
self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
self.model.to("cuda:0") self.model.to("cuda:0")
# Zero out all the parameters.
for name, p in self.model.named_parameters(): for name, p in self.model.named_parameters():
p.data.zero_() p.data.zero_()
torch.cuda.synchronize() torch.cuda.synchronize()
# the argument for get_device_uuid is the index # The argument for `get_device_uuid` is the index of the GPU in the
# of the GPU in the visible devices. # list of visible devices.
from vllm.platforms import current_platform from vllm.platforms import current_platform
self.device_uuid = current_platform.get_device_uuid(0) self.device_uuid = current_platform.get_device_uuid(0)
...@@ -59,23 +95,23 @@ class RayTrainingActor: ...@@ -59,23 +95,23 @@ class RayTrainingActor:
data = {} data = {}
for name, p in self.model.named_parameters(): for name, p in self.model.named_parameters():
# the training actor might only have a subset of the weights # A training actor might hold only a subset of the weights and may
# and need to all-gather the weights from all the actors. # need to gather weights from other actors. For demonstration
# for demonstration, here we assume all training actors have # purposes, each training actor owns the full weight set.
# the full weights.
data[name] = reduce_tensor(p.detach()) data[name] = reduce_tensor(p.detach())
return {self.device_uuid: data} return {self.device_uuid: data}
# ray manages 4 GPUs # Ray manages four GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init() ray.init()
# we want to co-locate vLLM instance and the training actor # Co-locate vLLM instances and training actors on the same set of GPUs:
# on the same set of GPUs. # * GPU 0 and 1: training actor 0, training actor 1, and vLLM instance 0
# the placement plan is as follows: # (tensor parallelism = 2).
# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2) # * GPU 2 and 3: training actor 2, training actor 3, and vLLM instance 1
# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2) # (tensor parallelism = 2).
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4) pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready()) ray.get(pg.ready())
...@@ -104,10 +140,8 @@ for bundle_index, training_actor in enumerate(training_actors): ...@@ -104,10 +140,8 @@ for bundle_index, training_actor in enumerate(training_actors):
training_actor_device_ids.append(device_id) training_actor_device_ids.append(device_id)
for i, bundle_indices in enumerate([[0, 1], [2, 3]]): for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
# IMPORTANT: when creating vLLM instances, we need to # Use the following syntax instead of the @ray.remote decorator so that
# make sure there are no GPU activities on the target GPUs, # the placement group is customized for each bundle.
# otherwise, they will interfere with the vLLM memory profiling,
# and cause unexpected behaviors.
llm = ray.remote( llm = ray.remote(
num_cpus=0, num_cpus=0,
num_gpus=0, num_gpus=0,
...@@ -125,8 +159,8 @@ for i, bundle_indices in enumerate([[0, 1], [2, 3]]): ...@@ -125,8 +159,8 @@ for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
bundle_indices=bundle_indices, bundle_indices=bundle_indices,
) )
inference_engines.append(llm) inference_engines.append(llm)
# don't call any method on the inference engine here, # Do not call any method on the inference engine at this point; the call
# otherwise it will block until the vLLM instance is created. # blocks until the vLLM instance finishes initialization.
for i, llm in enumerate(inference_engines): for i, llm in enumerate(inference_engines):
inference_engine_device_ids.append( inference_engine_device_ids.append(
...@@ -134,26 +168,25 @@ for i, llm in enumerate(inference_engines): ...@@ -134,26 +168,25 @@ for i, llm in enumerate(inference_engines):
) )
print(f"inference engine {i} is on {inference_engine_device_ids[-1]}") print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
# check the placement # Verify placement: the first two training actors share the same GPUs as
# the first two training actors should be # the first inference engine.
# on the same GPUs as the first inference engine
assert training_actor_device_ids[:2] == inference_engine_device_ids[0] assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# the last two training actors should be # Verify placement: the last two training actors share the same GPUs as
# on the same GPUs as the second inference engine # the second inference engine.
assert training_actor_device_ids[2:] == inference_engine_device_ids[1] assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
print("gather all the IPC handles from the training actors") print("Gather all the IPC handles from the training actors.")
ipc_handles = {} ipc_handles = {}
for actor in training_actors: for actor in training_actors:
ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote())) ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
print("update the weights of the inference engines") print("Update the weights of the inference engines.")
for llm in inference_engines: for llm in inference_engines:
ray.get( ray.get(
llm.collective_rpc.remote( llm.collective_rpc.remote(
"update_weights_from_ipc_handles", args=(ipc_handles,) "update_weights_from_ipc_handles", args=(ipc_handles,)
) )
) )
print("check if the weights are updated") print("Check if the weights are updated.")
for llm in inference_engines: for llm in inference_engines:
assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple())) assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, RequestOutput, SamplingParams
# Sample prompts.
# These module-level inputs are shared by both `llm.generate` calls in
# `main()`, so the outputs before and after the weight reload are produced
# from the same prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
# The same sampling settings are reused for every generation in this script.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
def print_prompts_and_outputs(outputs: list[RequestOutput]) -> None:
    """Print every request's prompt next to its first generated completion.

    A line of 60 dashes is written before the first entry and again after
    each entry, so an empty ``outputs`` list prints a single separator.

    Args:
        outputs: Request outputs; for each one, ``prompt`` and
            ``outputs[0].text`` are read and printed with ``repr`` quoting.
    """
    separator = "-" * 60
    print(separator)
    for request_output in outputs:
        # Read both fields before printing so that a malformed entry fails
        # before any partial record is written.
        prompt_text, completion_text = (
            request_output.prompt,
            request_output.outputs[0].text,
        )
        print(f"Prompt: {prompt_text!r}")
        print(f"Output: {completion_text!r}")
        print(separator)
def main():
    """Generate with dummy weights, reload real weights in place, generate again.

    The engine is first built with ``load_format="dummy"``. The load format
    is then switched to ``"auto"`` through the ``update_config`` RPC, and the
    ``reload_weights`` RPC is issued on the same engine. The module-level
    prompts are generated and printed once before and once after the reload.
    """
    # Build the engine with placeholder weights (nothing real is loaded yet).
    engine_options = {
        "model": "Qwen/Qwen3-0.6B",
        "load_format": "dummy",
        "enforce_eager": True,
        "tensor_parallel_size": 4,
    }
    llm = LLM(**engine_options)

    dummy_outputs = llm.generate(prompts, sampling_params)
    print("\nOutputs do not make sense:")
    print_prompts_and_outputs(dummy_outputs)

    # Switch the load format from `dummy` to `auto` on the workers ...
    config_overrides = {"load_config": {"load_format": "auto"}}
    llm.collective_rpc("update_config", args=(config_overrides,))

    # ... and reload the weights in place, without recreating the engine.
    llm.collective_rpc("reload_weights")

    # The same prompts should now produce sensible text.
    real_outputs = llm.generate(prompts, sampling_params)
    print("\nOutputs make sense after loading real weights:")
    print_prompts_and_outputs(real_outputs)


if __name__ == "__main__":
    main()
...@@ -84,6 +84,7 @@ def main(): ...@@ -84,6 +84,7 @@ def main():
gpu_memory_utilization=0.8, gpu_memory_utilization=0.8,
speculative_config=speculative_config, speculative_config=speculative_config,
disable_log_stats=False, disable_log_stats=False,
max_model_len=16384,
) )
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len) sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
......
...@@ -429,6 +429,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -429,6 +429,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
) )
# Nemotron_VL
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Prepare engine arguments and chat prompts for Llama-3.1-Nemotron-Nano-VL.

    Each question becomes a single-turn user message that starts with the
    ``<image>`` placeholder and is rendered through the model's own chat
    template (untokenized, with the generation prompt appended).

    Args:
        questions: Questions to ask about the image, one prompt per question.
        modality: Input modality; only ``"image"`` passes the assertion below.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the rendered
        prompts, and the stop-token ids known to the tokenizer.
    """
    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    assert modality == "image"
    image_tag = "<image>"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    conversations = []
    for question in questions:
        user_turn = {"role": "user", "content": f"{image_tag}\n{question}"}
        conversations.append([user_turn])
    prompts = tokenizer.apply_chat_template(
        conversations, tokenize=False, add_generation_prompt=True
    )

    # NOTE(review): this stop-token list is the one documented for InternVL
    # (https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py).
    # Model variants may use different stop words, so confirm it against the
    # Nemotron model card.
    stop_token_ids = []
    for token in ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"):
        token_id = tokenizer.convert_tokens_to_ids(token)
        # Skip tokens for which the tokenizer returns no id.
        if token_id is not None:
            stop_token_ids.append(token_id)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )
# Keye-VL # Keye-VL
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-8B-Preview" model_name = "Kwai-Keye/Keye-VL-8B-Preview"
...@@ -1186,6 +1224,7 @@ model_example_map = { ...@@ -1186,6 +1224,7 @@ model_example_map = {
"h2ovl_chat": run_h2ovl, "h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3, "idefics3": run_idefics3,
"internvl_chat": run_internvl, "internvl_chat": run_internvl,
"nemotron_vl": run_nemotron_vl,
"keye_vl": run_keye_vl, "keye_vl": run_keye_vl,
"kimi_vl": run_kimi_vl, "kimi_vl": run_kimi_vl,
"llava": run_llava, "llava": run_llava,
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This example shows how to use vLLM for running offline inference with This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding. the correct prompt format on vision language models for multimodal pooling.
For most models, the prompt format should follow corresponding examples For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
...@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args ...@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from PIL.Image import Image from PIL.Image import Image
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
...@@ -35,14 +36,22 @@ class TextImageQuery(TypedDict): ...@@ -35,14 +36,22 @@ class TextImageQuery(TypedDict):
image: Image image: Image
QueryModality = Literal["text", "image", "text+image"] class TextImagesQuery(TypedDict):
Query = Union[TextQuery, ImageQuery, TextImageQuery] modality: Literal["text+images"]
text: str
image: ScoreMultiModalParam
QueryModality = Literal["text", "image", "text+image", "text+images"]
Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
engine_args: EngineArgs engine_args: EngineArgs
prompt: str prompt: Optional[str] = None
image: Optional[Image] image: Optional[Image] = None
query: Optional[str] = None
documents: Optional[ScoreMultiModalParam] = None
def run_e5_v(query: Query) -> ModelRequestData: def run_e5_v(query: Query) -> ModelRequestData:
...@@ -107,6 +116,29 @@ def run_vlm2vec(query: Query) -> ModelRequestData: ...@@ -107,6 +116,29 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
) )
def run_jinavl_reranker(query: Query) -> ModelRequestData:
    """Prepare a scoring request for the ``jinaai/jina-reranker-m0`` model.

    Args:
        query: A query dict whose ``"modality"`` must be ``"text+images"``;
            its ``"text"`` entry becomes the scoring query and its
            ``"image"`` entry becomes the documents to score.

    Returns:
        A ``ModelRequestData`` with ``engine_args``, ``query`` and
        ``documents`` set.

    Raises:
        ValueError: If the query modality is anything but ``"text+images"``.
    """
    modality = query["modality"]
    if modality != "text+images":
        raise ValueError(f"Unsupported query modality: '{modality}'")

    # Pixel bounds forwarded to the multimodal processor.
    pixel_bounds = {
        "min_pixels": 3136,
        "max_pixels": 602112,
    }
    engine_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        task="score",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs=pixel_bounds,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        query=query["text"],
        documents=query["image"],
    )
def get_query(modality: QueryModality): def get_query(modality: QueryModality):
if modality == "text": if modality == "text":
return TextQuery(modality="text", text="A dog sitting in the grass") return TextQuery(modality="text", text="A dog sitting in the grass")
...@@ -128,6 +160,28 @@ def get_query(modality: QueryModality): ...@@ -128,6 +160,28 @@ def get_query(modality: QueryModality):
), ),
) )
if modality == "text+images":
return TextImagesQuery(
modality="text+images",
text="slm markdown",
image={
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
]
},
)
msg = f"Modality {modality} is not supported." msg = f"Modality {modality} is not supported."
raise ValueError(msg) raise ValueError(msg)
...@@ -162,16 +216,31 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -162,16 +216,31 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
print("-" * 50) print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: Optional[int]):
    """Score the example query for ``modality`` with ``model`` and print the scores.

    Args:
        model: Key into ``model_example_map`` selecting the request builder.
        modality: Query modality passed to ``get_query``.
        seed: Seed merged into the engine arguments before the ``LLM`` is
            constructed.
    """
    query = get_query(modality)
    build_request = model_example_map[model]
    req_data = build_request(query)

    # The seed is applied last so it overrides any seed in the engine args.
    llm_kwargs = {**asdict(req_data.engine_args), "seed": seed}
    llm = LLM(**llm_kwargs)

    outputs = llm.score(req_data.query, req_data.documents)

    divider = "-" * 30
    print(divider)
    scores = [item.outputs.score for item in outputs]
    print(scores)
    print(divider)
model_example_map = { model_example_map = {
"e5_v": run_e5_v, "e5_v": run_e5_v,
"vlm2vec": run_vlm2vec, "vlm2vec": run_vlm2vec,
"jinavl_reranker": run_jinavl_reranker,
} }
def parse_args(): def parse_args():
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with " description="Demo on using vLLM for offline inference with "
"vision language models for multimodal embedding" "vision language models for multimodal pooling tasks."
) )
parser.add_argument( parser.add_argument(
"--model-name", "--model-name",
...@@ -181,6 +250,14 @@ def parse_args(): ...@@ -181,6 +250,14 @@ def parse_args():
choices=model_example_map.keys(), choices=model_example_map.keys(),
help="The name of the embedding model.", help="The name of the embedding model.",
) )
parser.add_argument(
"--task",
"-t",
type=str,
default="embedding",
choices=["embedding", "scoring"],
help="The task type.",
)
parser.add_argument( parser.add_argument(
"--modality", "--modality",
type=str, type=str,
...@@ -198,7 +275,12 @@ def parse_args(): ...@@ -198,7 +275,12 @@ def parse_args():
def main(args: Namespace): def main(args: Namespace):
run_encode(args.model_name, args.modality, args.seed) if args.task == "embedding":
run_encode(args.model_name, args.modality, args.seed)
elif args.task == "scoring":
run_score(args.model_name, args.modality, args.seed)
else:
raise ValueError(f"Unsupported task: {args.task}")
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -93,6 +93,7 @@ ensure_python_library_installed() { ...@@ -93,6 +93,7 @@ ensure_python_library_installed() {
cleanup() { cleanup() {
echo "Stopping everything…" echo "Stopping everything…"
trap - INT TERM # prevent re-entrancy trap - INT TERM # prevent re-entrancy
pkill -9 -f "disagg_proxy_p2p_nccl_xpyd.py"
kill -- -$$ # negative PID == "this whole process-group" kill -- -$$ # negative PID == "this whole process-group"
wait # reap children so we don't leave zombies wait # reap children so we don't leave zombies
exit 0 exit 0
......
...@@ -4,7 +4,9 @@ ...@@ -4,7 +4,9 @@
import os import os
import socket import socket
import threading import threading
import time
import uuid import uuid
from typing import Any
import aiohttp import aiohttp
import msgpack import msgpack
...@@ -12,12 +14,25 @@ import zmq ...@@ -12,12 +14,25 @@ import zmq
from quart import Quart, make_response, request from quart import Quart, make_response, request
count = 0 count = 0
prefill_instances: dict[str, str] = {} # http_address: zmq_address prefill_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp)
decode_instances: dict[str, str] = {} # http_address: zmq_address decode_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp)
prefill_cv = threading.Condition() prefill_cv = threading.Condition()
decode_cv = threading.Condition() decode_cv = threading.Condition()
DEFAULT_PING_SECONDS = 5
def _remove_oldest_instances(instances: dict[str, Any]) -> None:
    """Evict expired entries from the front of ``instances``, in place.

    Each value is indexed as ``(zmq_address, stamp)``. Entries are visited
    in the dict's iteration order, and eviction stops at the first entry
    whose stamp is still later than the current time; entries behind it are
    not examined, even if their own stamps have already passed.

    Args:
        instances: Mapping from HTTP address to ``(zmq_address, stamp)``.
    """
    while True:
        front_key = next(iter(instances), None)
        if front_key is None:
            # Nothing left to inspect.
            return
        entry = instances[front_key]
        if entry[1] > time.time():
            # The front entry is still alive, so stop evicting.
            return
        print(f"🔴Remove [HTTP:{front_key}, ZMQ:{entry[0]}, stamp:{entry[1]}]")
        del instances[front_key]
def _listen_for_register(poller, router_socket): def _listen_for_register(poller, router_socket):
while True: while True:
...@@ -31,12 +46,23 @@ def _listen_for_register(poller, router_socket): ...@@ -31,12 +46,23 @@ def _listen_for_register(poller, router_socket):
global prefill_instances global prefill_instances
global prefill_cv global prefill_cv
with prefill_cv: with prefill_cv:
prefill_instances[data["http_address"]] = data["zmq_address"] node = prefill_instances.pop(data["http_address"], None)
prefill_instances[data["http_address"]] = (
data["zmq_address"],
time.time() + DEFAULT_PING_SECONDS,
)
_remove_oldest_instances(prefill_instances)
elif data["type"] == "D": elif data["type"] == "D":
global decode_instances global decode_instances
global decode_cv global decode_cv
with decode_cv: with decode_cv:
decode_instances[data["http_address"]] = data["zmq_address"] node = decode_instances.pop(data["http_address"], None)
decode_instances[data["http_address"]] = (
data["zmq_address"],
time.time() + DEFAULT_PING_SECONDS,
)
_remove_oldest_instances(decode_instances)
else: else:
print( print(
"Unexpected, Received message from %s, data: %s", "Unexpected, Received message from %s, data: %s",
...@@ -44,6 +70,9 @@ def _listen_for_register(poller, router_socket): ...@@ -44,6 +70,9 @@ def _listen_for_register(poller, router_socket):
data, data,
) )
if node is None:
print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]")
def start_service_discovery(hostname, port): def start_service_discovery(hostname, port):
if not hostname: if not hostname:
...@@ -105,12 +134,14 @@ async def handle_request(): ...@@ -105,12 +134,14 @@ async def handle_request():
with prefill_cv: with prefill_cv:
prefill_list = list(prefill_instances.items()) prefill_list = list(prefill_instances.items())
prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)] prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
prefill_zmq_addr = prefill_zmq_addr[0]
global decode_instances global decode_instances
global decode_cv global decode_cv
with decode_cv: with decode_cv:
decode_list = list(decode_instances.items()) decode_list = list(decode_instances.items())
decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)] decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
decode_zmq_addr = decode_zmq_addr[0]
print( print(
f"handle_request count: {count}, [HTTP:{prefill_addr}, " f"handle_request count: {count}, [HTTP:{prefill_addr}, "
......
#!/bin/bash
# Run `vllm bench serve` against a server at HOST:PORT.
# Every setting below is a default that can be overridden on the command
# line; run with -h or --help for the option list.

MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
HOST="localhost"
PORT=8006
NUM_PROMPTS=20
REQUEST_RATE=5

# Exit with an error when a value-taking option is the last argument.
# Without this check `shift 2` fails without shifting anything, so the
# parsing loop below would keep seeing the same option forever.
# Usage: require_value "$@"   (with the option still in $1)
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Missing value for option: $1"
        echo "Use -h or --help for usage information"
        exit 1
    fi
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            require_value "$@"
            MODEL_NAME="$2"
            shift 2
            ;;
        --local-model)
            MODEL_NAME=$LOCAL_MODEL_PATH
            shift
            ;;
        --host)
            require_value "$@"
            HOST="$2"
            shift 2
            ;;
        --port)
            require_value "$@"
            PORT="$2"
            shift 2
            ;;
        --num-prompts)
            require_value "$@"
            NUM_PROMPTS="$2"
            shift 2
            ;;
        --request-rate)
            require_value "$@"
            REQUEST_RATE="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo " --model MODEL_NAME Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
            echo " --local-model Use local model path (convenience option)"
            echo " --host HOST Set server host (default: localhost)"
            echo " --port PORT Set server port (default: 8006)"
            echo " --num-prompts N Set number of prompts (default: 20)"
            echo " --request-rate RATE Set request rate (default: 5)"
            echo " -h, --help Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use -h or --help for usage information"
            exit 1
            ;;
    esac
done

# Quote every expansion so values containing spaces (e.g. a local model
# path) reach the benchmark as a single argument.
vllm bench serve \
    --model "$MODEL_NAME" \
    --host "$HOST" \
    --port "$PORT" \
    --num-prompts "$NUM_PROMPTS" \
    --request-rate "$REQUEST_RATE"
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import sys
import requests
def scale(host, port, new_dp_size):
    """POST a scale request to the server's ``/scale_elastic_ep`` endpoint.

    Args:
        host: API server host.
        port: API server port.
        new_dp_size: Value sent as ``new_data_parallel_size``.

    Returns:
        ``True`` when the server answers with HTTP 200; ``False`` for any
        other status code or when the request raises a
        ``requests.exceptions.RequestException``.
    """
    endpoint = f"http://{host}:{port}/scale_elastic_ep"
    body = {"new_data_parallel_size": new_dp_size}
    request_headers = {"Content-Type": "application/json"}

    print(f"Sending scale request to {endpoint}")
    print(f"Payload: {json.dumps(body, indent=2)}")

    try:
        # The 300-second timeout is the value the example ships with.
        response = requests.post(
            endpoint, json=body, headers=request_headers, timeout=300
        )

        print(f"Status Code: {response.status_code}")
        print(f"Response: {response.text}")

        if response.status_code != 200:
            print("Scale up/down request failed!")
            return False

        print("Scale up/down request successful!")
        return True

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return False
def main():
    """Parse the command-line options, send one scale request, and exit.

    The process exit status is 0 when ``scale`` reports success and 1
    otherwise.
    """
    parser = argparse.ArgumentParser(description="Test scale up/down functionality")
    parser.add_argument("--host", default="localhost", help="API server host")
    parser.add_argument("--port", type=int, default=8006, help="API server port")
    parser.add_argument(
        "--new-dp-size", type=int, default=2, help="New data parallel size"
    )
    options = parser.parse_args()

    succeeded = scale(options.host, options.port, options.new_dp_size)
    if succeeded:
        sys.exit(0)
    sys.exit(1)


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment