Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4c676e3d
Commit
4c676e3d
authored
Jun 20, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-dev
parents
b4c4464d
b6553be1
Changes
437
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
456 additions
and
279 deletions
+456
-279
examples/offline_inference/basic/basic.py
examples/offline_inference/basic/basic.py
+1
-0
examples/offline_inference/basic/chat.py
examples/offline_inference/basic/chat.py
+7
-17
examples/offline_inference/basic/classify.py
examples/offline_inference/basic/classify.py
+9
-7
examples/offline_inference/basic/embed.py
examples/offline_inference/basic/embed.py
+8
-7
examples/offline_inference/basic/generate.py
examples/offline_inference/basic/generate.py
+3
-3
examples/offline_inference/basic/score.py
examples/offline_inference/basic/score.py
+4
-3
examples/offline_inference/batch_llm_inference.py
examples/offline_inference/batch_llm_inference.py
+12
-11
examples/offline_inference/chat_with_tools.py
examples/offline_inference/chat_with_tools.py
+63
-56
examples/offline_inference/context_extension.py
examples/offline_inference/context_extension.py
+68
-0
examples/offline_inference/data_parallel.py
examples/offline_inference/data_parallel.py
+77
-51
examples/offline_inference/disaggregated-prefill-v1/README.md
...ples/offline_inference/disaggregated-prefill-v1/README.md
+10
-0
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
...line_inference/disaggregated-prefill-v1/decode_example.py
+46
-31
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
...ine_inference/disaggregated-prefill-v1/prefill_example.py
+53
-38
examples/offline_inference/disaggregated-prefill-v1/run.sh
examples/offline_inference/disaggregated-prefill-v1/run.sh
+9
-3
examples/offline_inference/disaggregated_prefill.py
examples/offline_inference/disaggregated_prefill.py
+26
-14
examples/offline_inference/eagle.py
examples/offline_inference/eagle.py
+48
-30
examples/offline_inference/embed_jina_embeddings_v3.py
examples/offline_inference/embed_jina_embeddings_v3.py
+12
-8
No files found.
Too many changes to show.
To preserve performance only
437 of 437+
files are displayed.
Plain diff
Email patch
examples/offline_inference/basic/basic.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
...
...
examples/offline_inference/basic/chat.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -7,9 +8,8 @@ from vllm.utils import FlexibleArgumentParser
def
create_parser
():
parser
=
FlexibleArgumentParser
()
# Add engine args
engine_group
=
parser
.
add_argument_group
(
"Engine arguments"
)
EngineArgs
.
add_cli_args
(
engine_group
)
engine_group
.
set_defaults
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
)
EngineArgs
.
add_cli_args
(
parser
)
parser
.
set_defaults
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
)
# Add sampling params
sampling_group
=
parser
.
add_argument_group
(
"Sampling parameters"
)
sampling_group
.
add_argument
(
"--max-tokens"
,
type
=
int
)
...
...
@@ -57,22 +57,12 @@ def main(args: dict):
# In this script, we demonstrate how to pass input to the chat method:
conversation
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"Hello"
},
{
"role"
:
"assistant"
,
"content"
:
"Hello! How can I assist you today?"
},
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"Hello"
},
{
"role"
:
"assistant"
,
"content"
:
"Hello! How can I assist you today?"
},
{
"role"
:
"user"
,
"content"
:
"Write an essay about the importance of higher education."
,
"content"
:
"Write an essay about the importance of higher education."
,
},
]
outputs
=
llm
.
chat
(
conversation
,
sampling_params
,
use_tqdm
=
False
)
...
...
examples/offline_inference/basic/classify.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
@@ -10,9 +11,9 @@ def parse_args():
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
# Set example specific arguments
parser
.
set_defaults
(
model
=
"jason9693/Qwen2.5-1.5B-apeach"
,
task
=
"classify"
,
enforce_eager
=
True
)
parser
.
set_defaults
(
model
=
"jason9693/Qwen2.5-1.5B-apeach"
,
task
=
"classify"
,
enforce_eager
=
True
)
return
parser
.
parse_args
()
...
...
@@ -36,10 +37,11 @@ def main(args: Namespace):
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
probs
=
output
.
outputs
.
probs
probs_trimmed
=
((
str
(
probs
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
probs
)
>
16
else
probs
)
print
(
f
"Prompt:
{
prompt
!
r
}
\n
"
f
"Class Probabilities:
{
probs_trimmed
}
(size=
{
len
(
probs
)
}
)"
)
probs_trimmed
=
(
str
(
probs
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
probs
)
>
16
else
probs
print
(
f
"Prompt:
{
prompt
!
r
}
\n
"
f
"Class Probabilities:
{
probs_trimmed
}
(size=
{
len
(
probs
)
}
)"
)
print
(
"-"
*
60
)
...
...
examples/offline_inference/basic/embed.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
@@ -10,9 +11,9 @@ def parse_args():
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
# Set example specific arguments
parser
.
set_defaults
(
model
=
"intfloat/e5-mistral-7b-instruct"
,
task
=
"embed"
,
enforce_eager
=
True
)
parser
.
set_defaults
(
model
=
"intfloat/e5-mistral-7b-instruct"
,
task
=
"embed"
,
enforce_eager
=
True
)
return
parser
.
parse_args
()
...
...
@@ -36,10 +37,10 @@ def main(args: Namespace):
print
(
"
\n
Generated Outputs:
\n
"
+
"-"
*
60
)
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
embeds
=
output
.
outputs
.
embedding
embeds_trimmed
=
(
(
str
(
embeds
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
embeds
)
>
16
else
embeds
)
print
(
f
"Prompt:
{
prompt
!
r
}
\n
"
f
"
Embeddings:
{
embeds_trimmed
}
(size=
{
len
(
embeds
)
}
)"
)
embeds_trimmed
=
(
(
str
(
embeds
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
embeds
)
>
16
else
embeds
)
print
(
f
"Prompt:
{
prompt
!
r
}
\n
Embeddings:
{
embeds_trimmed
}
(size=
{
len
(
embeds
)
}
)"
)
print
(
"-"
*
60
)
...
...
examples/offline_inference/basic/generate.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -7,9 +8,8 @@ from vllm.utils import FlexibleArgumentParser
def
create_parser
():
parser
=
FlexibleArgumentParser
()
# Add engine args
engine_group
=
parser
.
add_argument_group
(
"Engine arguments"
)
EngineArgs
.
add_cli_args
(
engine_group
)
engine_group
.
set_defaults
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
)
EngineArgs
.
add_cli_args
(
parser
)
parser
.
set_defaults
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
)
# Add sampling params
sampling_group
=
parser
.
add_argument_group
(
"Sampling parameters"
)
sampling_group
.
add_argument
(
"--max-tokens"
,
type
=
int
)
...
...
examples/offline_inference/basic/score.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
@@ -10,9 +11,9 @@ def parse_args():
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
# Set example specific arguments
parser
.
set_defaults
(
model
=
"BAAI/bge-reranker-v2-m3"
,
task
=
"score"
,
enforce_eager
=
True
)
parser
.
set_defaults
(
model
=
"BAAI/bge-reranker-v2-m3"
,
task
=
"score"
,
enforce_eager
=
True
)
return
parser
.
parse_args
()
...
...
examples/offline_inference/batch_llm_inference.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use Ray Data for data parallel batch inference.
...
...
@@ -17,12 +18,14 @@ Ray Data provides functionality for:
Learn more about Ray Data's LLM integration:
https://docs.ray.io/en/latest/data/working-with-llms.html
"""
import
ray
from
packaging.version
import
Version
from
ray.data.llm
import
build_llm_processor
,
vLLMEngineProcessorConfig
assert
Version
(
ray
.
__version__
)
>=
Version
(
"2.44.1"
),
"Ray version must be at least 2.44.1"
assert
Version
(
ray
.
__version__
)
>=
Version
(
"2.44.1"
),
(
"Ray version must be at least 2.44.1"
)
# Uncomment to reduce clutter in stdout
# ray.init(log_to_driver=False)
...
...
@@ -53,20 +56,18 @@ config = vLLMEngineProcessorConfig(
vllm_processor
=
build_llm_processor
(
config
,
preprocess
=
lambda
row
:
dict
(
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a bot that responds with haikus."
},
{
"role"
:
"user"
,
"content"
:
row
[
"text"
]
}],
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a bot that responds with haikus."
},
{
"role"
:
"user"
,
"content"
:
row
[
"text"
]},
],
sampling_params
=
dict
(
temperature
=
0.3
,
max_tokens
=
250
,
)),
),
),
postprocess
=
lambda
row
:
dict
(
answer
=
row
[
"generated_text"
],
**
row
# This will return all the original columns in the dataset.
**
row
,
# This will return all the original columns in the dataset.
),
)
...
...
examples/offline_inference/chat_with_tools.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import
json
...
...
@@ -50,27 +51,32 @@ model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# or any other mistral model with function calling ability
sampling_params
=
SamplingParams
(
max_tokens
=
8192
,
temperature
=
0.0
)
llm
=
LLM
(
model
=
model_name
,
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
)
load_format
=
"mistral"
,
)
def
generate_random_id
(
length
=
9
):
characters
=
string
.
ascii_letters
+
string
.
digits
random_id
=
''
.
join
(
random
.
choice
(
characters
)
for
_
in
range
(
length
))
random_id
=
""
.
join
(
random
.
choice
(
characters
)
for
_
in
range
(
length
))
return
random_id
# simulate an API that can be called
def
get_current_weather
(
city
:
str
,
state
:
str
,
unit
:
'str'
):
return
(
f
"The weather in
{
city
}
,
{
state
}
is 85 degrees
{
unit
}
. It is "
"partly cloudly, with highs in the 90's."
)
def
get_current_weather
(
city
:
str
,
state
:
str
,
unit
:
"str"
):
return
(
f
"The weather in
{
city
}
,
{
state
}
is 85 degrees
{
unit
}
. It is "
"partly cloudly, with highs in the 90's."
)
tool_funtions
=
{
"get_current_weather"
:
get_current_weather
}
tool_fun
c
tions
=
{
"get_current_weather"
:
get_current_weather
}
tools
=
[{
tools
=
[
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
,
...
...
@@ -79,58 +85,59 @@ tools = [{
"type"
:
"object"
,
"properties"
:
{
"city"
:
{
"type"
:
"string"
,
"description"
:
"The city to find the weather for, e.g. 'San Francisco'"
"type"
:
"string"
,
"description"
:
"The city to find the weather for, e.g. 'San Francisco'"
,
},
"state"
:
{
"type"
:
"string"
,
"description"
:
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
"type"
:
"string"
,
"description"
:
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
,
},
"unit"
:
{
"type"
:
"string"
,
"description"
:
"The unit to fetch the temperature in"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
]
}
"enum"
:
[
"celsius"
,
"fahrenheit"
],
},
},
"required"
:
[
"city"
,
"state"
,
"unit"
],
},
},
"required"
:
[
"city"
,
"state"
,
"unit"
]
}
}
}
]
]
messages
=
[
{
"role"
:
"user"
,
"content"
:
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}
]
messages
=
[
{
"role"
:
"user"
,
"content"
:
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
,
}
]
outputs
=
llm
.
chat
(
messages
,
sampling_params
=
sampling_params
,
tools
=
tools
)
output
=
outputs
[
0
].
outputs
[
0
].
text
.
strip
()
# append the assistant message
messages
.
append
({
messages
.
append
(
{
"role"
:
"assistant"
,
"content"
:
output
,
})
}
)
# let's now actually parse and execute the model's output simulating an API call by using the
# above defined function
tool_calls
=
json
.
loads
(
output
)
tool_answers
=
[
tool_funtions
[
call
[
'
name
'
]](
**
call
[
'
arguments
'
])
for
call
in
tool_calls
tool_fun
c
tions
[
call
[
"
name
"
]](
**
call
[
"
arguments
"
])
for
call
in
tool_calls
]
# append the answer as a tool message and let the LLM give you an answer
messages
.
append
({
messages
.
append
(
{
"role"
:
"tool"
,
"content"
:
"
\n\n
"
.
join
(
tool_answers
),
"tool_call_id"
:
generate_random_id
(),
})
}
)
outputs
=
llm
.
chat
(
messages
,
sampling_params
,
tools
=
tools
)
...
...
examples/offline_inference/context_extension.py
0 → 100644
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
and run a simple chat example.
Usage:
python examples/offline_inference/context_extension.py
"""
from
vllm
import
LLM
,
SamplingParams
def create_llm(
    model="Qwen/Qwen3-0.6B",
    rope_theta=1000000,
    original_max_position_embeddings=32768,
    factor=4.0,
):
    """Build an LLM whose context length is extended with YaRN rope scaling.

    Every parameter is optional; the defaults reproduce the values that this
    example previously hard-coded, so ``create_llm()`` behaves as before.

    Args:
        model: Model name or path handed to ``LLM``.
        rope_theta: Value written to the ``rope_theta`` override.
        original_max_position_embeddings: Position-embedding length of the
            model before scaling; forwarded inside ``rope_scaling``.
        factor: YaRN scaling factor applied to the original length.

    Returns:
        The constructed ``LLM`` instance.
    """
    # Use yarn to extend context
    hf_overrides = {
        "rope_theta": rope_theta,
        "rope_scaling": {
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        # The extended window is the original length multiplied by the factor.
        "max_model_len": int(original_max_position_embeddings * factor),
    }
    llm = LLM(model=model, hf_overrides=hf_overrides)
    return llm
def run_llm_chat(llm):
    """Send a short, fixed conversation through ``llm.chat``.

    Args:
        llm: Object exposing
            ``chat(conversation, sampling_params, use_tqdm=...)``.

    Returns:
        Whatever ``llm.chat`` returns for the conversation.
    """
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]
    # The progress bar is disabled to keep the example output clean.
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    return outputs
def print_outputs(outputs):
    """Print each prompt and its first generated completion to stdout.

    Args:
        outputs: Iterable of objects that expose ``prompt`` and
            ``outputs[0].text``.
    """
    separator = "-" * 80
    print("\nGenerated Outputs:\n" + separator)
    for output in outputs:
        prompt = output.prompt
        # Only the first completion of each request is shown.
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print(separator)
def main():
    """Create the LLM, run the sample chat, and print the results."""
    llm = create_llm()
    outputs = run_llm_chat(llm)
    print_outputs(outputs)
# Run the example only when the file is executed as a script.
if __name__ == "__main__":
    main()
examples/offline_inference/data_parallel.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Usage:
Single node:
...
...
@@ -27,6 +28,7 @@ Multi-node:
--master-addr=10.99.48.128
\
--master-port=13345
"""
import
os
from
time
import
sleep
...
...
@@ -36,40 +38,46 @@ from vllm.utils import get_open_port
def
parse_args
():
import
argparse
parser
=
argparse
.
ArgumentParser
(
description
=
"Data Parallel Inference"
)
parser
.
add_argument
(
"--model"
,
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"ibm-research/PowerMoE-3b"
,
help
=
"Model name or path"
)
parser
.
add_argument
(
"--dp-size"
,
type
=
int
,
default
=
2
,
help
=
"Data parallel size"
)
parser
.
add_argument
(
"--tp-size"
,
type
=
int
,
default
=
2
,
help
=
"Tensor parallel size"
)
parser
.
add_argument
(
"--node-size"
,
type
=
int
,
default
=
1
,
help
=
"Total number of nodes"
)
parser
.
add_argument
(
"--node-rank"
,
type
=
int
,
default
=
0
,
help
=
"Rank of the current node"
)
parser
.
add_argument
(
"--master-addr"
,
type
=
str
,
default
=
""
,
help
=
"Master node IP address"
)
parser
.
add_argument
(
"--master-port"
,
type
=
int
,
default
=
0
,
help
=
"Master node port"
)
help
=
"Model name or path"
,
)
parser
.
add_argument
(
"--dp-size"
,
type
=
int
,
default
=
2
,
help
=
"Data parallel size"
)
parser
.
add_argument
(
"--tp-size"
,
type
=
int
,
default
=
2
,
help
=
"Tensor parallel size"
)
parser
.
add_argument
(
"--node-size"
,
type
=
int
,
default
=
1
,
help
=
"Total number of nodes"
)
parser
.
add_argument
(
"--node-rank"
,
type
=
int
,
default
=
0
,
help
=
"Rank of the current node"
)
parser
.
add_argument
(
"--master-addr"
,
type
=
str
,
default
=
""
,
help
=
"Master node IP address"
)
parser
.
add_argument
(
"--master-port"
,
type
=
int
,
default
=
0
,
help
=
"Master node port"
)
parser
.
add_argument
(
"--enforce-eager"
,
action
=
"store_true"
,
help
=
"Enforce eager mode execution."
)
parser
.
add_argument
(
"--trust-remote-code"
,
action
=
"store_true"
,
help
=
"Trust remote code."
)
return
parser
.
parse_args
()
def
main
(
model
,
dp_size
,
local_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_port
,
GPUs_per_dp_rank
):
def
main
(
model
,
dp_size
,
local_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_port
,
GPUs_per_dp_rank
,
enforce_eager
,
trust_remote_code
,
):
os
.
environ
[
"VLLM_DP_RANK"
]
=
str
(
global_dp_rank
)
os
.
environ
[
"VLLM_DP_RANK_LOCAL"
]
=
str
(
local_dp_rank
)
os
.
environ
[
"VLLM_DP_SIZE"
]
=
str
(
dp_size
)
...
...
@@ -90,10 +98,14 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
# with DP, each rank should process different prompts.
# usually all the DP ranks process a full dataset,
# and each rank processes a different part of the dataset.
promts_per_rank
=
len
(
prompts
)
//
dp_size
start
=
global_dp_rank
*
promts_per_rank
end
=
start
+
promts_per_rank
prompts
=
prompts
[
start
:
end
]
floor
=
len
(
prompts
)
//
dp_size
remainder
=
len
(
prompts
)
%
dp_size
# Distribute prompts into even groups.
def
start
(
rank
):
return
rank
*
floor
+
min
(
rank
,
remainder
)
prompts
=
prompts
[
start
(
global_dp_rank
)
:
start
(
global_dp_rank
+
1
)]
if
len
(
prompts
)
==
0
:
# if any rank has no prompts to process,
# we need to set a placeholder prompt
...
...
@@ -104,15 +116,18 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
# since we are doing data parallel, every rank can have different
# sampling params. here we set different max_tokens for different
# ranks for demonstration.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
max_tokens
=
[
16
,
20
][
global_dp_rank
%
2
]
)
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
max_tokens
=
[
16
,
20
][
global_dp_rank
%
2
]
)
# Create an LLM.
llm
=
LLM
(
model
=
model
,
llm
=
LLM
(
model
=
model
,
tensor_parallel_size
=
GPUs_per_dp_rank
,
enforce_eager
=
True
,
enable_expert_parallel
=
True
)
enforce_eager
=
enforce_eager
,
enable_expert_parallel
=
True
,
trust_remote_code
=
trust_remote_code
,
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
i
,
output
in
enumerate
(
outputs
):
...
...
@@ -121,15 +136,16 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
break
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"DP rank
{
global_dp_rank
}
, Prompt:
{
prompt
!
r
}
, "
f
"Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"DP rank
{
global_dp_rank
}
, Prompt:
{
prompt
!
r
}
, "
f
"Generated text:
{
generated_text
!
r
}
"
)
# Give engines time to pause their processing loops before exiting.
sleep
(
1
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
dp_size
=
args
.
dp_size
...
...
@@ -151,19 +167,29 @@ if __name__ == "__main__":
procs
=
[]
for
local_dp_rank
,
global_dp_rank
in
enumerate
(
range
(
node_rank
*
dp_per_node
,
(
node_rank
+
1
)
*
dp_per_node
)):
proc
=
Process
(
target
=
main
,
args
=
(
args
.
model
,
dp_size
,
local_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_port
,
tp_size
))
range
(
node_rank
*
dp_per_node
,
(
node_rank
+
1
)
*
dp_per_node
)
):
proc
=
Process
(
target
=
main
,
args
=
(
args
.
model
,
dp_size
,
local_dp_rank
,
global_dp_rank
,
dp_master_ip
,
dp_master_port
,
tp_size
,
args
.
enforce_eager
,
args
.
trust_remote_code
,
),
)
proc
.
start
()
procs
.
append
(
proc
)
exit_code
=
0
for
proc
in
procs
:
proc
.
join
(
timeout
=
300
)
if
proc
.
exitcode
is
None
:
print
(
f
"Killing process
{
proc
.
pid
}
that "
f
"didn't stop within 5 minutes."
)
print
(
f
"Killing process
{
proc
.
pid
}
that didn't stop within 5 minutes."
)
proc
.
kill
()
exit_code
=
1
elif
proc
.
exitcode
:
...
...
examples/offline_inference/disaggregated-prefill-v1/README.md
0 → 100644
View file @
4c676e3d
# Disaggregated Prefill V1
This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.
## Files
- `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
  - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
# Read prompts from output.txt
prompts
=
[]
try
:
def
read_prompts
():
"""Read prompts from output.txt"""
prompts
=
[]
try
:
with
open
(
"output.txt"
)
as
f
:
for
line
in
f
:
prompts
.
append
(
line
.
strip
())
print
(
f
"Loaded
{
len
(
prompts
)
}
prompts from output.txt"
)
except
FileNotFoundError
:
return
prompts
except
FileNotFoundError
:
print
(
"Error: output.txt file not found"
)
exit
(
-
1
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
0.95
,
max_tokens
=
10
)
llm
=
LLM
(
def
main
():
prompts
=
read_prompts
()
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
0.95
,
max_tokens
=
10
)
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.8
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
16
,
kv_transfer_config
=
KVTransferConfig
.
from_cli
(
'{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
'"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
))
#, max_model_len=2048, max_num_batched_tokens=2048)
kv_transfer_config
=
KVTransferConfig
(
kv_connector
=
"SharedStorageConnector"
,
kv_role
=
"kv_both"
,
kv_connector_extra_config
=
{
"shared_storage_path"
:
"local_storage"
},
),
)
# , max_model_len=2048, max_num_batched_tokens=2048)
# 1ST generation (prefill instance)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# 1ST generation (prefill instance)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
print
(
"-"
*
30
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
\n
Generated text:
{
generated_text
!
r
}
"
)
print
(
"-"
*
30
)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
context
=
"Hi "
*
1000
context2
=
"Hey "
*
500
prompts
=
[
def
read_prompts
():
context
=
"Hi "
*
1000
context2
=
"Hey "
*
500
return
[
context
+
"Hello, my name is"
,
context
+
"The capital of France is"
,
context2
+
"Your name is"
,
context2
+
"The capital of China is"
,
]
]
def
main
():
prompts
=
read_prompts
()
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
0.95
,
max_tokens
=
1
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
top_p
=
0.95
,
max_tokens
=
1
)
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.8
,
kv_transfer_config
=
KVTransferConfig
.
from_cli
(
'{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
'"kv_connector_extra_config": '
'{"shared_storage_path": "local_storage"}}'
)
)
#, max_model_len=2048, max_num_batched_tokens=2048)
# 1ST generation (prefill instance)
outputs
=
llm
.
generate
(
kv_transfer_config
=
KVTransferConfig
(
kv_connector
=
"SharedStorageConnector"
,
kv_role
=
"kv_both"
,
kv_connector_extra_config
=
{
"shared_storage_path"
:
"local_storage"
},
),
)
# , max_model_len=2048, max_num_batched_tokens=2048)
# 1ST generation (prefill instance)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
)
)
new_prompts
=
[]
for
output
in
outputs
:
new_prompts
=
[]
print
(
"-"
*
30
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
new_prompts
.
append
(
prompt
+
generated_text
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
\n
Generated text:
{
generated_text
!
r
}
"
)
print
(
"-"
*
30
)
# Write new_prompts to output.txt
with
open
(
"output.txt"
,
"w"
)
as
f
:
# Write new_prompts to output.txt
with
open
(
"output.txt"
,
"w"
)
as
f
:
for
prompt
in
new_prompts
:
f
.
write
(
prompt
+
"
\n
"
)
print
(
f
"Saved
{
len
(
new_prompts
)
}
prompts to output.txt"
)
print
(
f
"Saved
{
len
(
new_prompts
)
}
prompts to output.txt"
)
if
__name__
==
"__main__"
:
main
()
examples/offline_inference/disaggregated-prefill-v1/run.sh
View file @
4c676e3d
rm
-rf
local_storage/
rm
output.txt
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3 prefill_example.py
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3 decode_example.py
if
[
-f
"output.txt"
]
;
then
rm
output.txt
fi
# The directory of current script
SCRIPT_DIR
=
$(
dirname
"
$(
readlink
-f
"
$0
"
)
"
)
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3
"
$SCRIPT_DIR
/prefill_example.py"
VLLM_ENABLE_V1_MULTIPROCESSING
=
0
CUDA_VISIBLE_DEVICES
=
0 python3
"
$SCRIPT_DIR
/decode_example.py"
examples/offline_inference/disaggregated_prefill.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of disaggregated prefilling
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and then transfer the KV cache between them.
"""
import
os
import
time
from
multiprocessing
import
Event
,
Process
...
...
@@ -32,16 +34,21 @@ def run_prefill(prefill_done):
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc
=
KVTransferConfig
.
from_cli
(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
ktc
=
KVTransferConfig
(
kv_connector
=
"PyNcclConnector"
,
kv_role
=
"kv_producer"
,
kv_rank
=
0
,
kv_parallel_size
=
2
,
)
# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
kv_transfer_config
=
ktc
,
max_model_len
=
2000
,
gpu_memory_utilization
=
0.8
)
gpu_memory_utilization
=
0.8
,
)
llm
.
generate
(
prompts
,
sampling_params
)
print
(
"Prefill node is finished."
)
...
...
@@ -71,16 +78,21 @@ def run_decode(prefill_done):
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc
=
KVTransferConfig
.
from_cli
(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
ktc
=
KVTransferConfig
(
kv_connector
=
"PyNcclConnector"
,
kv_role
=
"kv_consumer"
,
kv_rank
=
1
,
kv_parallel_size
=
2
,
)
# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
kv_transfer_config
=
ktc
,
max_model_len
=
2000
,
gpu_memory_utilization
=
0.8
)
gpu_memory_utilization
=
0.8
,
)
# Wait for the producer to start the pipe
print
(
"Waiting for prefill node to finish..."
)
...
...
@@ -97,8 +109,8 @@ def run_decode(prefill_done):
def
main
():
prefill_done
=
Event
()
prefill_process
=
Process
(
target
=
run_prefill
,
args
=
(
prefill_done
,
))
decode_process
=
Process
(
target
=
run_decode
,
args
=
(
prefill_done
,
))
prefill_process
=
Process
(
target
=
run_prefill
,
args
=
(
prefill_done
,))
decode_process
=
Process
(
target
=
run_decode
,
args
=
(
prefill_done
,))
# Start prefill node
prefill_process
.
start
()
...
...
examples/offline_inference/eagle.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
json
import
os
...
...
@@ -6,6 +7,7 @@ import os
from
transformers
import
AutoTokenizer
from
vllm
import
LLM
,
SamplingParams
from
vllm.v1.metrics.reader
import
Counter
,
Vector
def
load_prompts
(
dataset_path
,
num_prompts
):
...
...
@@ -20,9 +22,7 @@ def load_prompts(dataset_path, num_prompts):
print
(
f
"Error reading dataset:
{
e
}
"
)
return
[]
else
:
prompts
=
[
"The future of AI is"
,
"The president of the United States is"
]
prompts
=
[
"The future of AI is"
,
"The president of the United States is"
]
return
prompts
[:
num_prompts
]
...
...
@@ -33,27 +33,35 @@ def parse_args():
"--dataset"
,
type
=
str
,
default
=
"./examples/data/gsm8k.jsonl"
,
help
=
"downloaded from the eagle repo "
\
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
help
=
"downloaded from the eagle repo "
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
,
)
parser
.
add_argument
(
"--method"
,
type
=
str
,
default
=
"eagle"
,
choices
=
[
"eagle"
,
"eagle3"
]
)
parser
.
add_argument
(
"--max_num_seqs"
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
"--num_prompts"
,
type
=
int
,
default
=
80
)
parser
.
add_argument
(
"--num_spec_tokens"
,
type
=
int
,
default
=
2
)
parser
.
add_argument
(
"--tp"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--draft_tp"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--enforce_eager"
,
action
=
'
store_true
'
)
parser
.
add_argument
(
"--enable_chunked_prefill"
,
action
=
'
store_true
'
)
parser
.
add_argument
(
"--enforce_eager"
,
action
=
"
store_true
"
)
parser
.
add_argument
(
"--enable_chunked_prefill"
,
action
=
"
store_true
"
)
parser
.
add_argument
(
"--max_num_batched_tokens"
,
type
=
int
,
default
=
2048
)
parser
.
add_argument
(
"--temp"
,
type
=
float
,
default
=
0
)
return
parser
.
parse_args
()
def
main
():
args
=
parse_args
()
model_dir
=
"meta-llama/Llama-3.1-8B-Instruct"
if
args
.
method
==
"eagle"
:
eagle_dir
=
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
elif
args
.
method
==
"eagle3"
:
eagle_dir
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
else
:
raise
ValueError
(
f
"unknown method:
{
args
.
method
}
"
)
max_model_len
=
2048
...
...
@@ -62,11 +70,9 @@ def main():
prompts
=
load_prompts
(
args
.
dataset
,
args
.
num_prompts
)
prompt_ids
=
[
tokenizer
.
apply_chat_template
([{
"role"
:
"user"
,
"content"
:
prompt
}],
add_generation_prompt
=
True
)
tokenizer
.
apply_chat_template
(
[{
"role"
:
"user"
,
"content"
:
prompt
}],
add_generation_prompt
=
True
)
for
prompt
in
prompts
]
...
...
@@ -81,7 +87,7 @@ def main():
max_num_seqs
=
args
.
max_num_seqs
,
gpu_memory_utilization
=
0.8
,
speculative_config
=
{
"method"
:
"eagle3"
if
"eagle3"
in
eagle_dir
.
lower
()
else
"eagle"
,
"method"
:
args
.
method
,
"model"
:
eagle_dir
,
"num_speculative_tokens"
:
args
.
num_spec_tokens
,
"draft_tensor_parallel_size"
:
args
.
draft_tp
,
...
...
@@ -92,30 +98,42 @@ def main():
sampling_params
=
SamplingParams
(
temperature
=
args
.
temp
,
max_tokens
=
256
)
outputs
=
llm
.
generate
(
prompt_token_ids
=
prompt_ids
,
sampling_params
=
sampling_params
)
outputs
=
llm
.
generate
(
prompt_token_ids
=
prompt_ids
,
sampling_params
=
sampling_params
)
if
not
hasattr
(
outputs
,
"metrics"
)
or
outputs
.
metrics
is
None
:
# print the generated text
for
output
in
outputs
:
print
(
"-"
*
50
)
print
(
f
"prompt:
{
output
.
prompt
}
"
)
print
(
f
"generated text:
{
output
.
outputs
[
0
].
text
}
"
)
print
(
"-"
*
50
)
try
:
metrics
=
llm
.
get_metrics
()
except
AssertionError
:
print
(
"Metrics are not supported in the V0 engine."
)
return
# calculate the average number of accepted tokens per forward pass, +1 is
# to account for the token from the target model that's always going to be
# accepted
acceptance_counts
=
[
0
]
*
(
args
.
num_spec_tokens
+
1
)
for
output
in
outputs
:
for
step
,
count
in
enumerate
(
output
.
metrics
.
spec_token_acceptance_counts
):
acceptance_counts
[
step
]
+=
count
num_drafts
=
num_accepted
=
0
acceptance_counts
=
[
0
]
*
args
.
num_spec_tokens
for
metric
in
metrics
:
if
metric
.
name
==
"vllm:spec_decode_num_drafts"
:
assert
isinstance
(
metric
,
Counter
)
num_drafts
+=
metric
.
value
elif
metric
.
name
==
"vllm:spec_decode_num_accepted_tokens"
:
assert
isinstance
(
metric
,
Counter
)
num_accepted
+=
metric
.
value
elif
metric
.
name
==
"vllm:spec_decode_num_accepted_tokens_per_pos"
:
assert
isinstance
(
metric
,
Vector
)
for
pos
in
range
(
len
(
metric
.
values
)):
acceptance_counts
[
pos
]
+=
metric
.
values
[
pos
]
print
(
"-"
*
50
)
print
(
f
"mean acceptance length:
\
{
sum
(
acceptance_counts
)
/
acceptance_counts
[
0
]:.
2
f
}
"
)
print
(
f
"mean acceptance length:
{
1
+
(
num_accepted
/
num_drafts
):.
2
f
}
"
)
print
(
"-"
*
50
)
# print acceptance at each token position
for
i
in
range
(
len
(
acceptance_counts
)):
print
(
f
"acceptance at token
{
i
}
:"
f
"
{
acceptance_counts
[
i
]
/
(
acceptance_counts
[
0
]):.
2
f
}
"
)
print
(
f
"acceptance at token
{
i
}
:
{
acceptance_counts
[
i
]
/
num_drafts
:.
2
f
}
"
)
if
__name__
==
"__main__"
:
...
...
examples/offline_inference/embed_jina_embeddings_v3.py
View file @
4c676e3d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
argparse
import
Namespace
...
...
@@ -10,9 +11,9 @@ def parse_args():
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
# Set example specific arguments
parser
.
set_defaults
(
model
=
"jinaai/jina-embeddings-v3"
,
task
=
"embed"
,
trust_remote_code
=
True
)
parser
.
set_defaults
(
model
=
"jinaai/jina-embeddings-v3"
,
task
=
"embed"
,
trust_remote_code
=
True
)
return
parser
.
parse_args
()
...
...
@@ -41,11 +42,14 @@ def main(args: Namespace):
print
(
"-"
*
60
)
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
embeds
=
output
.
outputs
.
embedding
embeds_trimmed
=
((
str
(
embeds
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
embeds
)
>
16
else
embeds
)
print
(
f
"Prompt:
{
prompt
!
r
}
\n
"
embeds_trimmed
=
(
(
str
(
embeds
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
embeds
)
>
16
else
embeds
)
print
(
f
"Prompt:
{
prompt
!
r
}
\n
"
f
"Embeddings for text matching:
{
embeds_trimmed
}
"
f
"(size=
{
len
(
embeds
)
}
)"
)
f
"(size=
{
len
(
embeds
)
}
)"
)
print
(
"-"
*
60
)
...
...
Prev
1
…
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment