Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a985548
Commit
7a985548
authored
May 22, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.0' into v0.9.0-ori
parents
45d3785c
dc1440cf
Changes
486
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
434 additions
and
79 deletions
+434
-79
examples/offline_inference/openai_batch/openai_example_batch.jsonl
...offline_inference/openai_batch/openai_example_batch.jsonl
+0
-0
examples/offline_inference/profiling.py
examples/offline_inference/profiling.py
+2
-2
examples/offline_inference/qwen2_5_omni/only_thinker.py
examples/offline_inference/qwen2_5_omni/only_thinker.py
+8
-5
examples/offline_inference/qwen_1m.py
examples/offline_inference/qwen_1m.py
+66
-0
examples/offline_inference/reproducibility.py
examples/offline_inference/reproducibility.py
+0
-0
examples/offline_inference/torchrun_example.py
examples/offline_inference/torchrun_example.py
+14
-9
examples/offline_inference/tpu.py
examples/offline_inference/tpu.py
+2
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+68
-36
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+31
-0
examples/online_serving/chart-helm/values.yaml
examples/online_serving/chart-helm/values.yaml
+1
-1
examples/online_serving/disaggregated_serving/README.md
examples/online_serving/disaggregated_serving/README.md
+8
-0
examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
...online_serving/disaggregated_serving/disagg_proxy_demo.py
+7
-3
examples/online_serving/disaggregated_serving/kv_events.sh
examples/online_serving/disaggregated_serving/kv_events.sh
+86
-0
examples/online_serving/kv_events_subscriber.py
examples/online_serving/kv_events_subscriber.py
+114
-0
examples/online_serving/openai_chat_completion_client_for_multimodal.py
...e_serving/openai_chat_completion_client_for_multimodal.py
+16
-12
examples/online_serving/openai_chat_completion_client_with_tools.py
...nline_serving/openai_chat_completion_client_with_tools.py
+2
-2
examples/online_serving/openai_chat_completion_structured_outputs.py
...line_serving/openai_chat_completion_structured_outputs.py
+5
-4
examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
...enai_chat_completion_structured_outputs_structural_tag.py
+1
-1
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
...enai_chat_completion_structured_outputs_with_reasoning.py
+2
-2
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
...rving/openai_chat_completion_tool_calls_with_reasoning.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
486 of 486+
files are displayed.
Plain diff
Email patch
examples/offline_inference/openai/openai_example_batch.jsonl
→
examples/offline_inference/openai
_batch
/openai_example_batch.jsonl
View file @
7a985548
File moved
examples/offline_inference/profiling.py
View file @
7a985548
...
...
@@ -14,7 +14,7 @@ import tqdm
from
vllm
import
LLM
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.profiler
import
layerwise_profile
from
vllm.profiler
.layerwise_profile
import
layerwise_profile
from
vllm.utils
import
FlexibleArgumentParser
BATCH_SIZE_DEFAULT
=
1
...
...
@@ -193,7 +193,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
batch_size
=
context
.
batch_size
prompt_len
=
context
.
prompt_len
scheduler_config
=
llm
.
llm_engine
.
scheduler_config
scheduler_config
=
llm
.
llm_engine
.
vllm_config
.
scheduler_config
max_model_len
=
llm
.
llm_engine
.
model_config
.
max_model_len
max_num_batched_tokens
=
scheduler_config
.
max_num_batched_tokens
max_num_seqs
=
scheduler_config
.
max_num_seqs
...
...
examples/offline_inference/qwen2_5_omni/only_thinker.py
View file @
7a985548
...
...
@@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
"image"
:
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
),
"video"
:
VideoAsset
(
name
=
"sample_demo_1.mp4"
,
num_frames
=
16
).
np_ndarrays
,
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
16
).
np_ndarrays
,
},
},
limit_mm_per_prompt
=
{
...
...
@@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
"<|im_start|>user
\n
<|vision_bos|><|VIDEO|><|vision_eos|>"
f
"
{
question
}
<|im_end|>
\n
"
f
"<|im_start|>assistant
\n
"
)
asset
=
VideoAsset
(
name
=
"
sample_demo_1.mp4
"
,
num_frames
=
16
)
asset
=
VideoAsset
(
name
=
"
baby_reading
"
,
num_frames
=
16
)
audio
=
asset
.
get_audio
(
sampling_rate
=
16000
)
assert
not
envs
.
VLLM_USE_V1
,
(
"V1 does not support use_audio_in_video. "
"Please launch this example with "
...
...
@@ -141,7 +140,7 @@ def main(args):
print
(
generated_text
)
if
__name__
==
"__main__"
:
def
parse_args
()
:
parser
=
FlexibleArgumentParser
(
description
=
'Demo on using vLLM for offline inference with '
'audio language models'
)
...
...
@@ -156,5 +155,9 @@ if __name__ == "__main__":
default
=
None
,
help
=
"Set the seed when initializing `vllm.LLM`."
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/offline_inference/qwen_1m.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
os
from
urllib.request
import
urlopen
from
vllm
import
LLM
,
SamplingParams
# Configure vLLM through environment variables. This runs at import time,
# ahead of the engine construction further down in the script.
os.environ.update({
    "VLLM_ATTENTION_BACKEND": "DUAL_CHUNK_FLASH_ATTN",
    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1",
})
def load_prompt() -> str:
    """Download the 600k test prompt and return it as a string.

    The file is fetched with a 5-second timeout and decoded as UTF-8.
    """
    # Test cases with various lengths can be found at:
    #
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
    url = ("https://qianwen-res.oss-cn-beijing.aliyuncs.com"
           "/Qwen2.5-1M/test-data/600k.txt")
    with urlopen(url, timeout=5) as response:
        return response.read().decode('utf-8')
# Processing the prompt.
def process_requests(llm: LLM, prompts: list[str]) -> None:
    """Generate a completion for every prompt and print a summary line each.

    Args:
        llm: Engine used to run generation.
        prompts: Prompt strings passed to ``llm.generate`` in one call.
    """
    # Sampling configuration shared by all prompts.
    sampling_kwargs = {
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20,
        "repetition_penalty": 1.05,
        "detokenize": True,
        "max_tokens": 256,
    }
    sampling_params = SamplingParams(**sampling_kwargs)

    # Run generation, then report prompt size and the first completion.
    for result in llm.generate(prompts, sampling_params):
        num_prompt_tokens = len(result.prompt_token_ids)
        completion = result.outputs[0].text
        print(f"Prompt length: {num_prompt_tokens}, "
              f"Generated text: {completion!r}")
# Create an LLM.
def initialize_engine() -> LLM:
    """Construct the engine for the Qwen2.5 1M-context example.

    Returns:
        An ``LLM`` for ``Qwen/Qwen2.5-7B-Instruct-1M`` configured with
        ``max_model_len=1048576``, ``tensor_parallel_size=4``, eager
        execution, chunked prefill and ``max_num_batched_tokens=131072``.
    """
    engine_kwargs = dict(
        model="Qwen/Qwen2.5-7B-Instruct-1M",
        max_model_len=1048576,
        tensor_parallel_size=4,
        enforce_eager=True,
        enable_chunked_prefill=True,
        max_num_batched_tokens=131072,
    )
    return LLM(**engine_kwargs)
def main():
    """Build the engine, fetch the long prompt, and run one generation."""
    engine = initialize_engine()
    # The engine is created before the prompt is downloaded, as before.
    process_requests(engine, [load_prompt()])


if __name__ == '__main__':
    main()
examples/offline_inference/reproduciblity.py
→
examples/offline_inference/reproducib
i
lity.py
View file @
7a985548
File moved
examples/offline_inference/torchrun_example.py
View file @
7a985548
...
...
@@ -8,6 +8,8 @@ the argument 2 should match the `tensor_parallel_size` below.
see `tests/distributed/test_torchrun_example.py` for the unit test.
"""
import
torch.distributed
as
dist
from
vllm
import
LLM
,
SamplingParams
# Create prompts, the same across all ranks
...
...
@@ -27,23 +29,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# all ranks have the same random seed, so that sampling can be
# deterministic across ranks.
llm
=
LLM
(
model
=
"
facebook/opt-125m
"
,
model
=
"
meta-llama/Llama-3.1-8B
"
,
tensor_parallel_size
=
2
,
pipeline_parallel_size
=
2
,
distributed_executor_backend
=
"external_launcher"
,
seed
=
0
,
max_model_len
=
32768
,
seed
=
1
,
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# all ranks will have the same outputs
print
(
"-"
*
50
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
\n
"
f
"Generated text:
{
generated_text
!
r
}
"
)
if
dist
.
get_rank
()
==
0
:
print
(
"-"
*
50
)
"""
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
\n
"
f
"Generated text:
{
generated_text
!
r
}
\n
"
)
print
(
"-"
*
50
)
"""
Further tips:
1. to communicate control messages across all ranks, use the cpu group,
...
...
examples/offline_inference/tpu.py
View file @
7a985548
...
...
@@ -22,7 +22,8 @@ def main():
# In real workloads, `enforce_eager` should be `False`.
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
4
)
max_num_seqs
=
4
,
max_model_len
=
128
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
print
(
"-"
*
50
)
for
output
,
answer
in
zip
(
outputs
,
answers
):
...
...
examples/offline_inference/vision_language.py
View file @
7a985548
...
...
@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
4096
,
max_num_seqs
=
2
,
dtype
=
"bfloat16"
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[(
f
"<|im_start|>user
\n
<fim_prefix><|img|><fim_suffix>
{
question
}
"
...
...
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
2048
,
max_num_seqs
=
2
,
mm_processor_kwargs
=
{
"crop_to_patches"
:
True
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
f
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>
{
question
}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
...
...
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
prompts
=
[
f
"Question:
{
question
}
Answer:"
for
question
in
questions
]
engine_args
=
EngineArgs
(
model
=
"Salesforce/blip2-opt-6.7b"
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
model
=
"facebook/chameleon-7b"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
4096
,
max_num_seqs
=
2
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
...
...
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs
=
2
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
"<MORE_DETAILED_CAPTION>"
for
_
in
questions
]
...
...
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
model
=
"adept/fuyu-8b"
,
max_model_len
=
2048
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
2048
,
max_num_seqs
=
2
,
mm_processor_kwargs
=
{
"do_pan_and_scan"
:
True
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[(
"<bos><start_of_turn>user
\n
"
...
...
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code
=
True
,
enforce_eager
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
...
...
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
8192
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
...
...
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge"
:
3
*
364
},
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[(
f
"<|begin_of_text|>User:<image>
{
question
}
<end_of_utterance>
\n
Assistant:"
...
...
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge"
:
384
},
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
(
f
"<|im_start|>User:<image>
{
question
}
<end_of_utterance>
\n
Assistant:"
)
...
...
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
...
...
@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
model
=
"moonshotai/Kimi-VL-A3B-Instruct"
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-1.5-7b-hf"
,
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-v1.6-mistral-7b-hf"
,
max_model_len
=
8192
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str],
model
=
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str],
engine_args
=
EngineArgs
(
model
=
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
,
max_model_len
=
16384
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
model
=
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
max_model_len
=
4096
,
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
stop_token_ids
=
[
128009
]
...
...
@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
max_model_len
=
4096
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0
...
...
@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
8192
,
max_num_seqs
=
2
,
tensor_parallel_size
=
2
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
for
question
in
questions
]
...
...
@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
model
=
model_name
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
...
...
@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs
=
4
,
tensor_parallel_size
=
8
,
gpu_memory_utilization
=
0.4
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
...
...
@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
model
=
model_name
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
...
...
@@ -706,7 +706,38 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code
=
True
,
max_model_len
=
4096
,
tensor_parallel_size
=
4
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[[{
'role'
:
'user'
,
'content'
:
f
"<image>
\n
{
question
}
"
}]
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
return
ModelRequestData
(
engine_args
=
engine_args
,
prompts
=
prompts
,
)
# Ovis
def
run_ovis
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
model_name
=
"AIDC-AI/Ovis2-1B"
engine_args
=
EngineArgs
(
model
=
model_name
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
trust_remote_code
=
True
,
dtype
=
"half"
,
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
...
...
@@ -733,7 +764,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
prompts
=
[
"caption en"
for
_
in
questions
]
engine_args
=
EngineArgs
(
model
=
"google/paligemma-3b-mix-224"
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -750,7 +781,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
prompts
=
[
"caption en"
for
_
in
questions
]
engine_args
=
EngineArgs
(
model
=
"google/paligemma2-3b-ft-docci-448"
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -787,7 +818,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs
=
2
,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs
=
{
"num_crops"
:
16
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -821,7 +852,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_lora_rank
=
320
,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs
=
{
"dynamic_hd"
:
16
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
return
ModelRequestData
(
...
...
@@ -842,7 +873,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
model
=
model_name
,
max_model_len
=
6144
,
max_num_seqs
=
2
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
for
question
in
questions
]
...
...
@@ -863,7 +894,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
max_model_len
=
1024
,
max_num_seqs
=
2
,
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
prompts
=
[
f
"
{
question
}
Picture 1: <img></img>
\n
"
for
question
in
questions
]
...
...
@@ -888,7 +919,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels"
:
28
*
28
,
"max_pixels"
:
1280
*
28
*
28
,
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
if
modality
==
"image"
:
...
...
@@ -923,7 +954,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
1
,
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
if
modality
==
"image"
:
...
...
@@ -957,7 +988,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
"max_pixels"
:
1280
*
28
*
28
,
"fps"
:
[
1
],
},
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
if
modality
==
"image"
:
...
...
@@ -990,7 +1021,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
model
=
model_name
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
limit_mm_per_prompt
=
{
"image"
:
1
},
limit_mm_per_prompt
=
{
modality
:
1
},
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
...
...
@@ -1041,6 +1072,7 @@ model_example_map = {
"llama4"
:
run_llama4
,
"molmo"
:
run_molmo
,
"NVLM_D"
:
run_nvlm_d
,
"ovis"
:
run_ovis
,
"paligemma"
:
run_paligemma
,
"paligemma2"
:
run_paligemma2
,
"phi3_v"
:
run_phi3v
,
...
...
@@ -1080,7 +1112,7 @@ def get_multi_modal_input(args):
if
args
.
modality
==
"video"
:
# Input video and question
video
=
VideoAsset
(
name
=
"
sample_demo_1.mp4
"
,
video
=
VideoAsset
(
name
=
"
baby_reading
"
,
num_frames
=
args
.
num_frames
).
np_ndarrays
vid_questions
=
[
"Why is this video funny?"
]
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
7a985548
...
...
@@ -436,6 +436,36 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
)
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare the multi-image request for the Ovis2 example model.

    Args:
        question: Question text appended after the image placeholders.
        image_urls: URLs of the images; one ``<image>`` placeholder is
            emitted per URL and each URL is fetched with ``fetch_image``.

    Returns:
        ``ModelRequestData`` holding the engine arguments, the chat-templated
        prompt string, and the fetched images.
    """
    model_name = "AIDC-AI/Ovis2-1B"
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": num_images},
    )

    # One numbered placeholder line per image, numbering starts at 1.
    placeholder_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)
    ]
    placeholders = "\n".join(placeholder_lines)
    user_turn = {'role': 'user', 'content': f"{placeholders}\n{question}"}

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    prompt = tokenizer.apply_chat_template([user_turn],
                                           tokenize=False,
                                           add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def
load_pixtral_hf
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"mistral-community/pixtral-12b"
...
...
@@ -685,6 +715,7 @@ model_example_map = {
"mistral3"
:
load_mistral3
,
"mllama"
:
load_mllama
,
"NVLM_D"
:
load_nvlm_d
,
"ovis"
:
load_ovis
,
"phi3_v"
:
load_phi3v
,
"phi4_mm"
:
load_phi4mm
,
"pixtral_hf"
:
load_pixtral_hf
,
...
...
examples/online_serving/chart-helm/values.yaml
View file @
7a985548
...
...
@@ -8,7 +8,7 @@ image:
# -- Image tag
tag
:
"
latest"
# -- Container launch command
command
:
[
"
vllm"
,
"
serve"
,
"
/data/"
,
"
--served-model-name"
,
"
opt-125m"
,
"
--dtype"
,
"
b
float16"
,
"
--host"
,
"
0.0.0.0"
,
"
--port"
,
"
8000"
]
command
:
[
"
vllm"
,
"
serve"
,
"
/data/"
,
"
--served-model-name"
,
"
opt-125m"
,
"
--dtype"
,
"
float
32"
,
"
--block-size"
,
"
16"
,
"
--host"
,
"
0.0.0.0"
,
"
--port"
,
"
8000"
]
# -- Container port
containerPort
:
8000
...
...
examples/online_serving/disaggregated_serving/README.md
0 → 100644
View file @
7a985548
# Disaggregated Serving
This example contains scripts that demonstrate the disaggregated serving features of vLLM.
## Files
-
`disagg_proxy_demo.py`
- Demonstrates XpYd (X prefill instances, Y decode instances).
-
`kv_events.sh`
- Demonstrates KV cache event publishing.
examples/online_serving/disagg
_examples
/disagg_proxy_demo.py
→
examples/online_serving/disagg
regated_serving
/disagg_proxy_demo.py
View file @
7a985548
...
...
@@ -4,7 +4,7 @@ This file provides a disaggregated prefilling proxy demo to demonstrate an
example usage of XpYd disaggregated prefilling.
We can launch multiple vllm instances (2 for prefill and 2 for decode), and
launch this proxy demo through:
python3 examples/online_serving/disagg
_examples
/disagg_proxy_demo.py
\
python3 examples/online_serving/disagg
regated_serving
/disagg_proxy_demo.py
\
--model $model_name
\
--prefill localhost:8100 localhost:8101
\
--decode localhost:8200 localhost:8201
\
...
...
@@ -414,7 +414,7 @@ class ProxyServer:
server
.
run
()
if
__name__
==
"__main__"
:
def
parse_args
()
:
# Todo: allow more config
parser
=
argparse
.
ArgumentParser
(
"vLLM disaggregated proxy server."
)
parser
.
add_argument
(
"--model"
,
...
...
@@ -445,6 +445,10 @@ if __name__ == "__main__":
default
=
8000
,
help
=
"Server port number"
,
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
if
__name__
==
"__main__"
:
args
=
parse_args
()
proxy_server
=
ProxyServer
(
args
=
args
)
proxy_server
.
run_server
()
examples/online_serving/disaggregated_serving/kv_events.sh
0 → 100644
View file @
7a985548
#!/bin/bash
# This file demonstrates the KV cache event publishing
# We will launch a vLLM instance configured to publish KV cache
# events and launch a simple subscriber to log those events.
set
-xe
echo
"🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧"
sleep
1
MODEL_NAME
=
${
HF_MODEL_NAME
:-
meta
-llama/Meta-Llama-3.1-8B-Instruct
}
# Trap the SIGINT signal (triggered by Ctrl+C)
trap
'cleanup'
INT
# Cleanup function
# SIGINT handler (installed with `trap 'cleanup' INT`): force-kill the current
# user's processes whose command line matches "python", then exit with 0.
cleanup() {
    echo "Caught Ctrl+C, cleaning up..."
    # Restrict the kill to the invoking user's processes instead of every
    # Python process visible on the host. `id -u` is used rather than $USER
    # because $USER may be unset in minimal environments.
    # `|| true`: pkill exits non-zero when nothing matches, and under `set -e`
    # that would abort the handler before the final `exit 0`.
    pkill -9 -u "$(id -u)" -f python || true
    echo "Cleanup complete. Exiting."
    exit 0
}
export
VLLM_HOST_IP
=
$(
hostname
-I
|
awk
'{print $1}'
)
# A function that waits for the vLLM server to start
# Poll the completions endpoint on the given port once per second until curl
# succeeds, giving up after `timeout 1200`.
#   $1 - port to probe on localhost
# Returns 0 when the probe loop finished in time, 1 otherwise.
wait_for_server() {
    local port=$1
    if timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done"; then
        return 0
    else
        return 1
    fi
}
vllm serve
$MODEL_NAME
\
--port
8100
\
--max-model-len
100
\
--enforce-eager
\
--gpu-memory-utilization
0.8
\
--trust-remote-code
\
--kv-events-config
\
'{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}'
&
wait_for_server 8100
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
python3
"
$SCRIPT_DIR
/kv_events_subscriber.py"
&
sleep
1
# serve two example requests
output1
=
$(
curl
-X
POST
-s
http://localhost:8100/v1/completions
\
-H
"Content-Type: application/json"
\
-d
'{
"model": "'
"
$MODEL_NAME
"
'",
"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}'
)
output2
=
$(
curl
-X
POST
-s
http://localhost:8100/v1/completions
\
-H
"Content-Type: application/json"
\
-d
'{
"model": "'
"
$MODEL_NAME
"
'",
"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.",
"max_tokens": 80,
"temperature": 0
}'
)
# Cleanup commands
# pkill exits non-zero when no process matches. The script runs under `set -e`,
# so an unguarded no-match (likely for the second pkill, once the first has
# already killed the server) would abort before the outputs below are printed.
# `id -u` is used instead of $USER, which may be unset in minimal environments.
# NOTE(review): `-f` matches the full command line; if this script is invoked
# through a path containing "vllm" or "python" it may match itself - confirm.
pkill -9 -u "$(id -u)" -f python || true
pkill -9 -u "$(id -u)" -f vllm || true

# Give the killed processes a moment to go away.
sleep 1

echo "Cleaned up"

# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"

echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""
examples/online_serving/kv_events_subscriber.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Optional
,
Union
import
msgspec
import
zmq
from
msgspec.msgpack
import
Decoder
#
# Types copied from vllm.distributed.kv_events
#
class EventBatch(
        msgspec.Struct,
        array_like=True,
        omit_defaults=True,
        gc=False,
):
    """Generic container for a group of events.

    Declared ``array_like``, so field order is part of the wire format and
    must not be changed.
    """

    # Printed by ``process_event`` as the batch's "received at" value;
    # presumably a timestamp set by the publisher - TODO confirm.
    ts: float
    # Events carried by this batch (untyped at this level).
    events: list[Any]
class KVCacheEvent(
        msgspec.Struct,
        array_like=True,
        omit_defaults=True,
        gc=False,
        tag=True,
):
    """Common base for every KV cache event type.

    ``tag=True`` makes the struct tagged, which is what lets the decoder tell
    the subclasses apart inside a union.
    """
class BlockStored(KVCacheEvent):
    """Event carrying information about stored blocks.

    Field meanings below are inferred from the names only - confirm against
    ``vllm.distributed.kv_events``, where these types were copied from.
    """

    # Hashes of the blocks this event refers to.
    block_hashes: list[int]
    # Hash of the parent block, or None.
    parent_block_hash: Optional[int]
    # Token ids associated with the blocks.
    token_ids: list[int]
    # Block size value reported with the event.
    block_size: int
    # LoRA id, or None.
    lora_id: Optional[int]
class BlockRemoved(KVCacheEvent):
    """Event listing the hashes of blocks that were removed."""

    # Hashes of the blocks this event refers to.
    block_hashes: list[int]
class AllBlocksCleared(KVCacheEvent):
    """Event with no payload fields; only its type carries information."""
class KVEventBatch(EventBatch):
    """``EventBatch`` whose events are restricted to the KV cache event types.

    This is the type handed to the msgpack ``Decoder`` in ``main``.
    """

    # Narrows the base-class field to the three concrete event structs.
    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
def process_event(event_batch):
    """Print a header with the batch's ``ts`` and one line per event.

    Args:
        event_batch: Object exposing ``ts`` and an iterable ``events``.
    """
    header = f"Received event batch at {event_batch.ts}:"
    print(header)
    for item in event_batch.events:
        print(f" - {item}")
def main():
    """Subscribe to KV cache events over ZMQ and print every batch received.

    A SUB socket connected to ``tcp://localhost:5557`` receives three-frame
    messages (topic, big-endian sequence number, msgpack payload). When a gap
    in sequence numbers is detected, the missing batches are requested through
    a REQ socket connected to ``tcp://localhost:5558`` before the current
    batch is processed. The loop runs until interrupted with Ctrl+C.
    """
    decoder = Decoder(type=KVEventBatch)
    # Sequence number of the last batch processed; -1 until the first arrives.
    last_seq = -1

    context = zmq.Context()

    # Set up the main subscription socket
    sub = context.socket(zmq.SUB)
    sub.connect("tcp://localhost:5557")
    topic = "kv-events"
    sub.setsockopt_string(zmq.SUBSCRIBE, topic)

    # Initialize replay socket
    replay = context.socket(zmq.REQ)
    replay.connect("tcp://localhost:5558")
    poller = zmq.Poller()
    poller.register(replay, zmq.POLLIN)

    print("Listening for KV cache events on topic:", topic)

    try:
        while True:
            try:
                if sub.poll(50):
                    _, seq_bytes, payload = sub.recv_multipart()
                    seq = int.from_bytes(seq_bytes, "big")

                    if last_seq >= 0 and seq > last_seq + 1:
                        missed = seq - last_seq - 1
                        print(f"Missed {missed} messages"
                              f" (last: {last_seq}, current: {seq})")

                        # Ask for everything after the last batch we saw.
                        replay.send((last_seq + 1).to_bytes(8, "big"))

                        while poller.poll(timeout=200):
                            seq_bytes, replay_payload = replay.recv_multipart()
                            if not replay_payload:
                                # End of replay marker is sent as an empty
                                # frame for the payload
                                break

                            replay_seq = int.from_bytes(seq_bytes, "big")

                            if replay_seq > last_seq:
                                event_batch = decoder.decode(replay_payload)
                                process_event(event_batch)
                                last_seq = replay_seq
                                # Caught up to just before the live message.
                                if replay_seq >= seq - 1:
                                    break

                    event_batch = decoder.decode(payload)
                    process_event(event_batch)
                    # Record the live message's sequence number. Without this
                    # last_seq would stay at -1 and the gap check above could
                    # never trigger. It is set only after successful
                    # processing, so a failed batch is re-requested via replay.
                    last_seq = seq

                # ... do other periodic work or check for shutdown ...

            except KeyboardInterrupt:
                print("Interrupted")
                break
            except Exception as e:
                print("Error decoding message:", e)
    finally:
        # Close both sockets and terminate the context; linger=0 discards any
        # unsent replay request so shutdown cannot block on it.
        context.destroy(linger=0)


if __name__ == "__main__":
    main()
examples/online_serving/openai_chat_completion_client_for_multimodal.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""An example showing how to use vLLM to serve multimodal models
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf
--chat-template template_llava.jinja
vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate
\
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
\
--max-model-len 4096 --trust-remote-code
run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""
import
base64
import
requests
from
openai
import
OpenAI
from
utils
import
get_first_model
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -31,9 +37,6 @@ client = OpenAI(
base_url
=
openai_api_base
,
)
models
=
client
.
models
.
list
()
model
=
models
.
data
[
0
].
id
def
encode_base64_content_from_url
(
content_url
:
str
)
->
str
:
"""Encode a content retrieved from a remote url to base64 format."""
...
...
@@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str:
# Text-only inference
def
run_text_only
()
->
None
:
def
run_text_only
(
model
:
str
)
->
None
:
chat_completion
=
client
.
chat
.
completions
.
create
(
messages
=
[{
"role"
:
"user"
,
...
...
@@ -61,7 +64,7 @@ def run_text_only() -> None:
# Single-image input inference
def
run_single_image
()
->
None
:
def
run_single_image
(
model
:
str
)
->
None
:
## Use image url in the payload
image_url
=
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
...
...
@@ -117,7 +120,7 @@ def run_single_image() -> None:
# Multi-image input inference
def
run_multi_image
()
->
None
:
def
run_multi_image
(
model
:
str
)
->
None
:
image_url_duck
=
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion
=
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url
=
client
.
chat
.
completions
.
create
(
...
...
@@ -152,7 +155,7 @@ def run_multi_image() -> None:
# Video input inference
def
run_video
()
->
None
:
def
run_video
(
model
:
str
)
->
None
:
video_url
=
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64
=
encode_base64_content_from_url
(
video_url
)
...
...
@@ -208,7 +211,7 @@ def run_video() -> None:
# Audio input inference
def
run_audio
()
->
None
:
def
run_audio
(
model
:
str
)
->
None
:
from
vllm.assets.audio
import
AudioAsset
audio_url
=
AudioAsset
(
"winning_call"
).
url
...
...
@@ -318,7 +321,8 @@ def parse_args():
def
main
(
args
)
->
None
:
chat_type
=
args
.
chat_type
example_function_map
[
chat_type
]()
model
=
get_first_model
(
client
)
example_function_map
[
chat_type
](
model
)
if
__name__
==
"__main__"
:
...
...
examples/online_serving/openai_chat_completion_client_with_tools.py
View file @
7a985548
...
...
@@ -7,12 +7,12 @@ IMPORTANT: for mistral, you must use one of the provided mistral tool call
templates, or your own - the model default doesn't work for tool calls with vLLM
See the vLLM docs on OpenAI server & tool calling for more details.
vllm serve
--model
mistralai/Mistral-7B-Instruct-v0.3
\
vllm serve mistralai/Mistral-7B-Instruct-v0.3
\
--chat-template examples/tool_chat_template_mistral.jinja
\
--enable-auto-tool-choice --tool-call-parser mistral
OR
vllm serve
--model
NousResearch/Hermes-2-Pro-Llama-3-8B
\
vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B
\
--chat-template examples/tool_chat_template_hermes.jinja
\
--enable-auto-tool-choice --tool-call-parser hermes
"""
...
...
examples/online_serving/openai_chat_completion_structured_outputs.py
View file @
7a985548
...
...
@@ -112,8 +112,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
"alan.turing@enigma.com
\n
"
)
try
:
# The
no-
fallback option forces vLLM to use
xgrammar, so when it fails
# you get a 400 with the reason why
# The
guided_decoding_disable_
fallback option forces vLLM to use
#
xgrammar, so when it fails
you get a 400 with the reason why
completion
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
[{
...
...
@@ -123,7 +123,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
extra_body
=
{
"guided_regex"
:
r
"\w+@\w+\.com\n"
,
"stop"
:
[
"
\n
"
],
"guided_decoding_backend"
:
"xgrammar:no-fallback"
"guided_decoding_backend"
:
"xgrammar"
,
"guided_decoding_disable_fallback"
:
True
,
},
)
return
completion
.
choices
[
0
].
message
.
content
...
...
@@ -137,7 +138,7 @@ def main():
api_key
=
"-"
,
)
model
=
"Qwen/Qwen2.5-3B-Instruct"
model
=
client
.
models
.
list
().
data
[
0
].
id
print
(
"Guided Choice Completion:"
)
print
(
guided_choice_completion
(
client
,
model
))
...
...
examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
View file @
7a985548
...
...
@@ -59,7 +59,7 @@ and San Francisco?
}]
response
=
client
.
chat
.
completions
.
create
(
model
=
"meta-llama/Llama-3.1-8B-Instruct"
,
model
=
client
.
models
.
list
().
data
[
0
].
id
,
messages
=
messages
,
response_format
=
{
"type"
:
...
...
examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
View file @
7a985548
...
...
@@ -4,12 +4,12 @@ An example shows how to generate structured outputs from reasoning models
like DeepSeekR1. The thinking process will not be guided by the JSON
schema provided by the user. Only the final output will be structured.
To run this example, you need to start the vLLM server with the reasoning
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
\
--enable-reasoning
--reasoning-parser deepseek_r1
--reasoning-parser deepseek_r1
```
This example demonstrates how to generate chat completions from reasoning models
...
...
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
View file @
7a985548
...
...
@@ -9,7 +9,7 @@ the reasoning parser and tool calling enabled.
```bash
vllm serve Qwen/QwQ-32B
\
--enable-reasoning
--reasoning-parser deepseek_r1
\
--reasoning-parser deepseek_r1
\
--enable-auto-tool-choice --tool-call-parser hermes
```
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment