Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
103 additions
and
7 deletions
+103
-7
examples/offline_inference/basic/embed.py
examples/offline_inference/basic/embed.py
+1
-0
examples/offline_inference/basic/generate.py
examples/offline_inference/basic/generate.py
+1
-0
examples/offline_inference/basic/score.py
examples/offline_inference/basic/score.py
+1
-0
examples/offline_inference/batch_llm_inference.py
examples/offline_inference/batch_llm_inference.py
+1
-0
examples/offline_inference/chat_with_tools.py
examples/offline_inference/chat_with_tools.py
+1
-0
examples/offline_inference/context_extension.py
examples/offline_inference/context_extension.py
+68
-0
examples/offline_inference/data_parallel.py
examples/offline_inference/data_parallel.py
+9
-4
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
...line_inference/disaggregated-prefill-v1/decode_example.py
+1
-0
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
...ine_inference/disaggregated-prefill-v1/prefill_example.py
+1
-0
examples/offline_inference/disaggregated-prefill-v1/run.sh
examples/offline_inference/disaggregated-prefill-v1/run.sh
+9
-3
examples/offline_inference/disaggregated_prefill.py
examples/offline_inference/disaggregated_prefill.py
+1
-0
examples/offline_inference/eagle.py
examples/offline_inference/eagle.py
+1
-0
examples/offline_inference/embed_jina_embeddings_v3.py
examples/offline_inference/embed_jina_embeddings_v3.py
+1
-0
examples/offline_inference/embed_matryoshka_fy.py
examples/offline_inference/embed_matryoshka_fy.py
+1
-0
examples/offline_inference/encoder_decoder.py
examples/offline_inference/encoder_decoder.py
+1
-0
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+1
-0
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+1
-0
examples/offline_inference/load_sharded_state.py
examples/offline_inference/load_sharded_state.py
+1
-0
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+1
-0
examples/offline_inference/metrics.py
examples/offline_inference/metrics.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
examples/offline_inference/basic/embed.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
from
argparse
import
Namespace
...
...
examples/offline_inference/basic/generate.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
EngineArgs
from
vllm
import
LLM
,
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
...
...
examples/offline_inference/basic/score.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
from
argparse
import
Namespace
...
...
examples/offline_inference/batch_llm_inference.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
This example shows how to use Ray Data for data parallel batch inference.
This example shows how to use Ray Data for data parallel batch inference.
...
...
examples/offline_inference/chat_with_tools.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# ruff: noqa
import
json
import
json
...
...
examples/offline_inference/context_extension.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YaRN method (rope_scaling)
and run a simple chat example.
Usage:
python examples/offline_inference/context_extension.py
"""
from
vllm
import
LLM
,
SamplingParams
def create_llm():
    """Build a Qwen3-0.6B ``LLM`` with YaRN rope-scaling overrides.

    The Hugging Face config overrides set ``rope_theta``, a ``rope_scaling``
    entry of type ``"yarn"`` with factor 4.0 over 32768 original positions,
    and a ``max_model_len`` equal to their product (131072).

    Returns:
        The constructed ``LLM`` instance.
    """
    base_positions = 32768
    scale = 4.0

    # Use yarn to extend context: this is the rope_scaling part of the override.
    yarn_scaling = {
        "rope_type": "yarn",
        "factor": scale,
        "original_max_position_embeddings": base_positions,
    }
    overrides = {
        "rope_theta": 1000000,
        "rope_scaling": yarn_scaling,
        # The product is a float; the override stores it as an int.
        "max_model_len": int(base_positions * scale),
    }
    return LLM(model="Qwen/Qwen3-0.6B", hf_overrides=overrides)
def run_llm_chat(llm):
    """Send a fixed three-message conversation through ``llm.chat``.

    Args:
        llm: Object exposing ``chat(messages, sampling_params, use_tqdm=...)``.

    Returns:
        Whatever ``llm.chat`` returns for the conversation.
    """
    # (role, content) pairs in the order they are sent.
    turns = (
        ("system", "You are a helpful assistant"),
        ("user", "Hello"),
        ("assistant", "Hello! How can I assist you today?"),
    )
    messages = [{"role": role, "content": text} for role, text in turns]
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)
    return llm.chat(messages, params, use_tqdm=False)
def print_outputs(outputs):
    """Print each result's prompt and first completion between rule lines.

    Args:
        outputs: Iterable of objects exposing ``prompt`` and
            ``outputs[0].text``.
    """
    rule = "-" * 80
    print("\nGenerated Outputs:\n" + rule)
    for result in outputs:
        # Read both fields before printing anything for this result.
        prompt_text, completion = result.prompt, result.outputs[0].text
        print(f"Prompt: {prompt_text!r}\n")
        print(f"Generated text: {completion!r}")
        print(rule)
def main():
    """Build the model, run the sample chat, and print the results."""
    print_outputs(run_llm_chat(create_llm()))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
examples/offline_inference/data_parallel.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
Usage:
Usage:
Single node:
Single node:
...
@@ -97,10 +98,14 @@ def main(
...
@@ -97,10 +98,14 @@ def main(
# with DP, each rank should process different prompts.
# with DP, each rank should process different prompts.
# usually all the DP ranks process a full dataset,
# usually all the DP ranks process a full dataset,
# and each rank processes a different part of the dataset.
# and each rank processes a different part of the dataset.
promts_per_rank
=
len
(
prompts
)
//
dp_size
floor
=
len
(
prompts
)
//
dp_size
start
=
global_dp_rank
*
promts_per_rank
remainder
=
len
(
prompts
)
%
dp_size
end
=
start
+
promts_per_rank
prompts
=
prompts
[
start
:
end
]
# Distribute prompts into even groups.
def
start
(
rank
):
return
rank
*
floor
+
min
(
rank
,
remainder
)
prompts
=
prompts
[
start
(
global_dp_rank
)
:
start
(
global_dp_rank
+
1
)]
if
len
(
prompts
)
==
0
:
if
len
(
prompts
)
==
0
:
# if any rank has no prompts to process,
# if any rank has no prompts to process,
# we need to set a placeholder prompt
# we need to set a placeholder prompt
...
...
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
from
vllm.config
import
KVTransferConfig
...
...
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
from
vllm.config
import
KVTransferConfig
...
...
examples/offline_inference/disaggregated-prefill-v1/run.sh
View file @
cc7f22a8
rm
-rf
local_storage/
rm
-rf
local_storage/
rm
output.txt
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3 prefill_example.py
if
[
-f
"output.txt"
]
;
then
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3 decode_example.py
rm
output.txt
fi
# The directory of current script
SCRIPT_DIR
=
$(
dirname
"
$(
readlink
-f
"
$0
"
)
"
)
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3
"
$SCRIPT_DIR
/prefill_example.py"
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3
"
$SCRIPT_DIR
/decode_example.py"
examples/offline_inference/disaggregated_prefill.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
This file demonstrates the example usage of disaggregated prefilling
This file demonstrates the example usage of disaggregated prefilling
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
...
...
examples/offline_inference/eagle.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
argparse
import
json
import
json
import
os
import
os
...
...
examples/offline_inference/embed_jina_embeddings_v3.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
from
argparse
import
Namespace
...
...
examples/offline_inference/embed_matryoshka_fy.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
from
argparse
import
Namespace
...
...
examples/offline_inference/encoder_decoder.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
Demonstrate prompting of text-to-text
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
encoder/decoder models, specifically BART
...
...
examples/offline_inference/encoder_decoder_multimodal.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
This example shows how to use vLLM for running offline inference with
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
the explicit/implicit prompt format on enc-dec LMMs for text generation.
...
...
examples/offline_inference/llm_engine_example.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
This file demonstrates using the `LLMEngine`
This file demonstrates using the `LLMEngine`
for processing prompts with various sampling parameters.
for processing prompts with various sampling parameters.
...
...
examples/offline_inference/load_sharded_state.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
Validates the loading of a model saved with the sharded_state format.
Validates the loading of a model saved with the sharded_state format.
This script demonstrates how to load a model that was previously saved
This script demonstrates how to load a model that was previously saved
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
This example shows how to use LoRA with different quantization techniques
This example shows how to use LoRA with different quantization techniques
for offline inference.
for offline inference.
...
...
examples/offline_inference/metrics.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.v1.metrics.reader
import
Counter
,
Gauge
,
Histogram
,
Vector
from
vllm.v1.metrics.reader
import
Counter
,
Gauge
,
Histogram
,
Vector
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment