Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
103 additions
and
7 deletions
+103
-7
examples/offline_inference/basic/embed.py
examples/offline_inference/basic/embed.py
+1
-0
examples/offline_inference/basic/generate.py
examples/offline_inference/basic/generate.py
+1
-0
examples/offline_inference/basic/score.py
examples/offline_inference/basic/score.py
+1
-0
examples/offline_inference/batch_llm_inference.py
examples/offline_inference/batch_llm_inference.py
+1
-0
examples/offline_inference/chat_with_tools.py
examples/offline_inference/chat_with_tools.py
+1
-0
examples/offline_inference/context_extension.py
examples/offline_inference/context_extension.py
+68
-0
examples/offline_inference/data_parallel.py
examples/offline_inference/data_parallel.py
+9
-4
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
...line_inference/disaggregated-prefill-v1/decode_example.py
+1
-0
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
...ine_inference/disaggregated-prefill-v1/prefill_example.py
+1
-0
examples/offline_inference/disaggregated-prefill-v1/run.sh
examples/offline_inference/disaggregated-prefill-v1/run.sh
+9
-3
examples/offline_inference/disaggregated_prefill.py
examples/offline_inference/disaggregated_prefill.py
+1
-0
examples/offline_inference/eagle.py
examples/offline_inference/eagle.py
+1
-0
examples/offline_inference/embed_jina_embeddings_v3.py
examples/offline_inference/embed_jina_embeddings_v3.py
+1
-0
examples/offline_inference/embed_matryoshka_fy.py
examples/offline_inference/embed_matryoshka_fy.py
+1
-0
examples/offline_inference/encoder_decoder.py
examples/offline_inference/encoder_decoder.py
+1
-0
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/encoder_decoder_multimodal.py
+1
-0
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_example.py
+1
-0
examples/offline_inference/load_sharded_state.py
examples/offline_inference/load_sharded_state.py
+1
-0
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+1
-0
examples/offline_inference/metrics.py
examples/offline_inference/metrics.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
examples/offline_inference/basic/embed.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
examples/offline_inference/basic/generate.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
...
...
examples/offline_inference/basic/score.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
examples/offline_inference/batch_llm_inference.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use Ray Data for data parallel batch inference.
...
...
examples/offline_inference/chat_with_tools.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import
json
...
...
examples/offline_inference/context_extension.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
and run a simple chat example.
Usage:
python examples/offline_inference/context_extension.py
"""
from
vllm
import
LLM
,
SamplingParams
def create_llm():
    """Create an LLM whose context window is extended with YaRN rope scaling.

    The Hugging Face config of ``Qwen/Qwen3-0.6B`` is overridden so that the
    rotary embedding uses ``rope_type="yarn"`` with the given scaling factor,
    and ``max_model_len`` is raised to the original maximum position count
    multiplied by that factor (32768 * 4.0 -> 131072).

    Returns:
        The constructed ``LLM`` instance.
    """
    rope_theta = 1000000
    original_max_position_embeddings = 32768
    factor = 4.0

    # Use yarn to extend context
    hf_overrides = {
        "rope_theta": rope_theta,
        "rope_scaling": {
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        # The scaled length is a float product; the override is passed as an int.
        "max_model_len": int(original_max_position_embeddings * factor),
    }
    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
    return llm
def run_llm_chat(llm):
    """Run one fixed chat conversation through ``llm`` and return the outputs.

    Args:
        llm: Object exposing ``chat(messages, sampling_params, use_tqdm=...)``;
            in this script it is the instance returned by ``create_llm``.

    Returns:
        Whatever ``llm.chat`` returns for the hard-coded conversation.
    """
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]

    # use_tqdm=False is passed through to chat() unchanged.
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    return outputs
def print_outputs(outputs) -> None:
    """Print the prompt and first generated text of every item in ``outputs``.

    Args:
        outputs: Iterable of objects that each expose ``prompt`` and
            ``outputs[0].text``.

    A header followed by an 80-dash rule is always printed, even when
    ``outputs`` is empty; each item is then followed by its own rule.
    """
    separator = "-" * 80
    print("\nGenerated Outputs:\n" + separator)
    for output in outputs:
        prompt = output.prompt
        # Only the first candidate of each request is shown.
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print(separator)
def main():
    """Build the LLM, run the sample chat, and print the results."""
    llm = create_llm()
    outputs = run_llm_chat(llm)
    print_outputs(outputs)
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
examples/offline_inference/data_parallel.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Usage:
Single node:
...
...
@@ -97,10 +98,14 @@ def main(
# with DP, each rank should process different prompts.
# usually all the DP ranks process a full dataset,
# and each rank processes a different part of the dataset.
promts_per_rank
=
len
(
prompts
)
//
dp_size
start
=
global_dp_rank
*
promts_per_rank
end
=
start
+
promts_per_rank
prompts
=
prompts
[
start
:
end
]
floor
=
len
(
prompts
)
//
dp_size
remainder
=
len
(
prompts
)
%
dp_size
# Distribute prompts into even groups.
def
start
(
rank
):
return
rank
*
floor
+
min
(
rank
,
remainder
)
prompts
=
prompts
[
start
(
global_dp_rank
)
:
start
(
global_dp_rank
+
1
)]
if
len
(
prompts
)
==
0
:
# if any rank has no prompts to process,
# we need to set a placeholder prompt
...
...
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
...
...
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
...
...
examples/offline_inference/disaggregated-prefill-v1/run.sh
View file @
cc7f22a8
rm
-rf
local_storage/
rm
output.txt
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3 prefill_example.py
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3 decode_example.py
if
[
-f
"output.txt"
]
;
then
rm
output.txt
fi
# The directory of current script
SCRIPT_DIR
=
$(
dirname
"
$(
readlink
-f
"
$0
"
)
"
)
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3
"
$SCRIPT_DIR
/prefill_example.py"
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3
"
$SCRIPT_DIR
/decode_example.py"
examples/offline_inference/disaggregated_prefill.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of disaggregated prefilling
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
...
...
examples/offline_inference/eagle.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
json
import
os
...
...
examples/offline_inference/embed_jina_embeddings_v3.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
examples/offline_inference/embed_matryoshka_fy.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
examples/offline_inference/encoder_decoder.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
...
...
examples/offline_inference/encoder_decoder_multimodal.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
...
...
examples/offline_inference/llm_engine_example.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates using the `LLMEngine`
for processing prompts with various sampling parameters.
...
...
examples/offline_inference/load_sharded_state.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Validates the loading of a model saved with the sharded_state format.
This script demonstrates how to load a model that was previously saved
...
...
examples/offline_inference/lora_with_quantization_inference.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.
...
...
examples/offline_inference/metrics.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm.v1.metrics.reader
import
Counter
,
Gauge
,
Histogram
,
Vector
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment