ModelZoo / LLaMA_vllm · Commits

Commit 25f39502
Authored Aug 16, 2025 by laibao
Update README.md: change the Docker image version and the deep-learning library dependencies, and delete several example files to simplify the codebase.
Parent: 951558c2
Changes: 186 in total; this page shows 20 changed files with 1667 additions and 66 deletions (+1667, -66).
examples/offline_inference/context_extension.py (+68, -0)
examples/offline_inference/data_parallel.py (+216, -0)
examples/offline_inference/disaggregated-prefill-v1/README.md (+10, -0)
examples/offline_inference/disaggregated-prefill-v1/decode_example.py (+51, -0)
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py (+58, -0)
examples/offline_inference/disaggregated-prefill-v1/run.sh (+11, -0)
examples/offline_inference/disaggregated_prefill.py (+127, -0)
examples/offline_inference/embed_jina_embeddings_v3.py (+58, -0)
examples/offline_inference/embed_matryoshka_fy.py (+52, -0)
examples/offline_inference/encoder_decoder.py (+132, -0)
examples/offline_inference/encoder_decoder_multimodal.py (+196, -0)
examples/offline_inference/llm_engine_example.py (+74, -0)
examples/offline_inference/load_sharded_state.py (+94, -0)
examples/offline_inference/lora_with_quantization_inference.py (+136, -0)
examples/offline_inference/metrics.py (+50, -0)
examples/offline_inference/mistral-small.py (+73, -52)
examples/offline_inference/mlpspeculator.py (+29, -14)
examples/offline_inference/multilora_inference.py (+122, -0)
examples/offline_inference/neuron.py (+49, -0)
examples/offline_inference/neuron_eagle.py (+61, -0)
examples/offline_inference/context_extension.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
and run a simple chat example.

Usage:
    python examples/offline_inference/context_extension.py
"""

from vllm import LLM, SamplingParams


def create_llm():
    rope_theta = 1000000
    original_max_position_embeddings = 32768
    factor = 4.0

    # Use yarn to extend context
    hf_overrides = {
        "rope_theta": rope_theta,
        "rope_scaling": {
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        "max_model_len": int(original_max_position_embeddings * factor),
    }

    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
    return llm


def run_llm_chat(llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    return outputs


def print_outputs(outputs):
    print("\nGenerated Outputs:\n" + "-" * 80)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print("-" * 80)


def main():
    llm = create_llm()
    outputs = run_llm_chat(llm)
    print_outputs(outputs)


if __name__ == "__main__":
    main()
examples/offline_inference/data_parallel.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Usage:
Single node:
    python examples/offline_inference/data_parallel.py \
        --model="ibm-research/PowerMoE-3b" \
        --dp-size=2 \
        --tp-size=2

Multi-node:
    Node 0 (assume the node has ip of 10.99.48.128):
        python examples/offline_inference/data_parallel.py \
            --model="ibm-research/PowerMoE-3b" \
            --dp-size=2 \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=0 \
            --master-addr=10.99.48.128 \
            --master-port=13345
    Node 1:
        python examples/offline_inference/data_parallel.py \
            --model="ibm-research/PowerMoE-3b" \
            --dp-size=2 \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=1 \
            --master-addr=10.99.48.128 \
            --master-port=13345
"""

import os
from time import sleep

from vllm import LLM, SamplingParams
from vllm.utils import get_open_port


def parse_args():
    import argparse

    parser = argparse.ArgumentParser(description="Data Parallel Inference")
    parser.add_argument(
        "--model",
        type=str,
        default="ibm-research/PowerMoE-3b",
        help="Model name or path",
    )
    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("--node-size", type=int, default=1, help="Total number of nodes")
    parser.add_argument("--node-rank", type=int, default=0, help="Rank of the current node")
    parser.add_argument("--master-addr", type=str, default="", help="Master node IP address")
    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
    parser.add_argument("--enforce-eager", action="store_true", help="Enforce eager mode execution.")
    parser.add_argument("--trust-remote-code", action="store_true", help="Trust remote code.")
    parser.add_argument(
        "--max-num-seqs",
        type=int,
        default=64,
        help=("Maximum number of sequences to be processed in a single iteration."),
    )
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.8,
        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
    )
    return parser.parse_args()


def main(
    model,
    dp_size,
    local_dp_rank,
    global_dp_rank,
    dp_master_ip,
    dp_master_port,
    GPUs_per_dp_rank,
    enforce_eager,
    trust_remote_code,
    max_num_seqs,
    gpu_memory_utilization,
):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
    os.environ["VLLM_DP_SIZE"] = str(dp_size)
    os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
    os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)

    # CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the
    # engine processes.

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ] * 100

    # with DP, each rank should process different prompts.
    # usually all the DP ranks process a full dataset,
    # and each rank processes a different part of the dataset.
    floor = len(prompts) // dp_size
    remainder = len(prompts) % dp_size

    # Distribute prompts into even groups.
    def start(rank):
        return rank * floor + min(rank, remainder)

    prompts = prompts[start(global_dp_rank):start(global_dp_rank + 1)]
    if len(prompts) == 0:
        # if any rank has no prompts to process,
        # we need to set a placeholder prompt
        prompts = ["Placeholder"]
    print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts")

    # Create a sampling params object.
    # since we are doing data parallel, every rank can have different
    # sampling params. here we set different max_tokens for different
    # ranks for demonstration.
    sampling_params = SamplingParams(
        temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
    )

    # Create an LLM.
    llm = LLM(
        model=model,
        tensor_parallel_size=GPUs_per_dp_rank,
        enforce_eager=enforce_eager,
        enable_expert_parallel=True,
        trust_remote_code=trust_remote_code,
        max_num_seqs=max_num_seqs,
        gpu_memory_utilization=gpu_memory_utilization,
    )
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for i, output in enumerate(outputs):
        if i >= 5:
            # print only 5 outputs
            break
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(
            f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
            f"Generated text: {generated_text!r}"
        )

    # Give engines time to pause their processing loops before exiting.
    sleep(1)


if __name__ == "__main__":
    args = parse_args()

    dp_size = args.dp_size
    tp_size = args.tp_size
    node_size = args.node_size
    node_rank = args.node_rank

    if node_size == 1:
        dp_master_ip = "127.0.0.1"
        dp_master_port = get_open_port()
    else:
        dp_master_ip = args.master_addr
        dp_master_port = args.master_port

    assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
    dp_per_node = dp_size // node_size

    from multiprocessing import Process

    procs = []
    for local_dp_rank, global_dp_rank in enumerate(
        range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
    ):
        proc = Process(
            target=main,
            args=(
                args.model,
                dp_size,
                local_dp_rank,
                global_dp_rank,
                dp_master_ip,
                dp_master_port,
                tp_size,
                args.enforce_eager,
                args.trust_remote_code,
                args.max_num_seqs,
                args.gpu_memory_utilization,
            ),
        )
        proc.start()
        procs.append(proc)
    exit_code = 0
    for proc in procs:
        proc.join(timeout=300)
        if proc.exitcode is None:
            print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
            proc.kill()
            exit_code = 1
        elif proc.exitcode:
            exit_code = proc.exitcode

    exit(exit_code)
examples/offline_inference/disaggregated-prefill-v1/README.md (new file, mode 100644)

# Disaggregated Prefill V1

This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.

## Files

- `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
  - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`.
- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
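A minimal invocation sketch for these scripts, assuming a working vLLM install and a single CUDA GPU (device 0, which `run.sh` pins via `CUDA_VISIBLE_DEVICES=0`):

```bash
# Run prefill then decode; launch run.sh from this directory so that
# local_storage/ and output.txt are created next to the scripts.
cd examples/offline_inference/disaggregated-prefill-v1
bash run.sh

# Afterwards the saved KV state lives in ./local_storage/ and the prefill
# prompts (plus their first generated token) are written to ./output.txt.
ls local_storage/
cat output.txt
```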
examples/offline_inference/disaggregated-prefill-v1/decode_example.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def read_prompts():
    """Read prompts from output.txt"""
    prompts = []
    try:
        with open("output.txt") as f:
            for line in f:
                prompts.append(line.strip())
        print(f"Loaded {len(prompts)} prompts from output.txt")
        return prompts
    except FileNotFoundError:
        print("Error: output.txt file not found")
        exit(-1)


def main():
    prompts = read_prompts()

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        max_num_batched_tokens=64,
        max_num_seqs=16,
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
    )
    # , max_model_len=2048, max_num_batched_tokens=2048)

    # 1ST generation (prefill instance)
    outputs = llm.generate(prompts, sampling_params)

    print("-" * 30)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 30)


if __name__ == "__main__":
    main()
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def read_prompts():
    context = "Hi " * 1000
    context2 = "Hey " * 500
    return [
        context + "Hello, my name is",
        context + "The capital of France is",
        context2 + "Your name is",
        context2 + "The capital of China is",
    ]


def main():
    prompts = read_prompts()
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        ),
    )
    # , max_model_len=2048, max_num_batched_tokens=2048)

    # 1ST generation (prefill instance)
    outputs = llm.generate(prompts, sampling_params)

    new_prompts = []
    print("-" * 30)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        new_prompts.append(prompt + generated_text)
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 30)

    # Write new_prompts to output.txt
    with open("output.txt", "w") as f:
        for prompt in new_prompts:
            f.write(prompt + "\n")
    print(f"Saved {len(new_prompts)} prompts to output.txt")


if __name__ == "__main__":
    main()
examples/offline_inference/disaggregated-prefill-v1/run.sh (new file, mode 100644)

rm -rf local_storage/

if [ -f "output.txt" ]; then
    rm output.txt
fi

# The directory of current script
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")

VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py"
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py"
examples/offline_inference/disaggregated_prefill.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of disaggregated prefilling.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and then transfer the KV cache between them.
"""

import os
import time
from multiprocessing import Event, Process

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def run_prefill(prefill_done):
    # We use GPU 0 for prefill node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # The prefill node receives two requests, while the decode node receives
    # three requests. So the decode node will only receive the KV Cache for
    # requests 1 and 3. The decode node will use the KV Cache of requests 1
    # and 3 and do prefilling on request 2.
    prompts = [
        "Hello, my name is",
        "Hi, your name is",
        # The decode node will actually "prefill" this request.
        "Tell me a very long story",
    ]
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

    # Using PyNcclConnector to transmit KV caches between vLLM instances.
    # This instance is the prefill node (kv_producer, rank 0).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
    ktc = KVTransferConfig(
        kv_connector="PyNcclConnector",
        kv_role="kv_producer",
        kv_rank=0,
        kv_parallel_size=2,
    )

    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.
    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        kv_transfer_config=ktc,
        max_model_len=2000,
        gpu_memory_utilization=0.8,
    )

    llm.generate(prompts, sampling_params)
    print("Prefill node is finished.")
    prefill_done.set()

    # To keep the prefill node running in case the decode node is not done;
    # otherwise, the script might exit prematurely, causing incomplete decoding.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Script stopped by user.")


def run_decode(prefill_done):
    # We use GPU 1 for decode node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    prompts = [
        "Hello, my name is",
        "Hi, your name is",
        "Tell me a very long story",
    ]
    sampling_params = SamplingParams(temperature=0, top_p=0.95)

    # Using PyNcclConnector to transmit KV caches between vLLM instances.
    # This instance is the decode node (kv_consumer, rank 1).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
    ktc = KVTransferConfig(
        kv_connector="PyNcclConnector",
        kv_role="kv_consumer",
        kv_rank=1,
        kv_parallel_size=2,
    )

    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.
    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        kv_transfer_config=ktc,
        max_model_len=2000,
        gpu_memory_utilization=0.8,
    )

    # Wait for the producer to start the pipe
    print("Waiting for prefill node to finish...")
    prefill_done.wait()

    # At this point when the prefill_done is set, the kv-cache should have been
    # transferred to this decode node, so we can start decoding.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


def main():
    prefill_done = Event()
    prefill_process = Process(target=run_prefill, args=(prefill_done,))
    decode_process = Process(target=run_decode, args=(prefill_done,))

    # Start prefill node
    prefill_process.start()
    # Start decode node
    decode_process.start()

    # Terminate the prefill node when decode is finished
    decode_process.join()
    prefill_process.terminate()


if __name__ == "__main__":
    main()
examples/offline_inference/embed_jina_embeddings_v3.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
    )
    return parser.parse_args()


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Follow the white rabbit.",  # English
        "Sigue al conejo blanco.",  # Spanish
        "Suis le lapin blanc.",  # French
        "跟着白兔走。",  # Chinese
        "اتبع الأرنب الأبيض.",  # Arabic
        "Folge dem weißen Kaninchen.",  # German
    ]

    # Create an LLM.
    # You should pass task="embed" for embedding models
    model = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    # Only text matching task is supported for now. See #16120
    outputs = model.embed(prompts)

    # Print the outputs.
    print("\nGenerated Outputs:")
    print("Only text matching task is supported for now. See #16120")
    print("-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
        embeds_trimmed = (
            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
        )
        print(
            f"Prompt: {prompt!r}\n"
            f"Embeddings for text matching: {embeds_trimmed} "
            f"(size={len(embeds)})"
        )
        print("-" * 60)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/embed_matryoshka_fy.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from argparse import Namespace

from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser


def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(
        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
    )
    return parser.parse_args()


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Follow the white rabbit.",  # English
        "Sigue al conejo blanco.",  # Spanish
        "Suis le lapin blanc.",  # French
        "跟着白兔走。",  # Chinese
        "اتبع الأرنب الأبيض.",  # Arabic
        "Folge dem weißen Kaninchen.",  # German
    ]

    # Create an LLM.
    # You should pass task="embed" for embedding models
    model = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))

    # Print the outputs.
    print("\nGenerated Outputs:")
    print("-" * 60)
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
        embeds_trimmed = (
            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
        )
        print(f"Prompt: {prompt!r}\nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
        print("-" * 60)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/encoder_decoder.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
"""

from vllm import LLM, SamplingParams
from vllm.inputs import (
    ExplicitEncoderDecoderPrompt,
    TextPrompt,
    TokensPrompt,
    zip_enc_dec_prompts,
)


def create_prompts(tokenizer):
    # Test prompts
    #
    # This section shows all of the valid ways to prompt an
    # encoder/decoder model.
    #
    # - Helpers for building prompts
    text_prompt_raw = "Hello, my name is"
    text_prompt = TextPrompt(prompt="The president of the United States is")
    tokens_prompt = TokensPrompt(
        prompt_token_ids=tokenizer.encode(prompt="The capital of France is")
    )
    # - Pass a single prompt to encoder/decoder model
    #   (implicitly encoder input prompt);
    #   decoder input prompt is assumed to be None
    single_text_prompt_raw = text_prompt_raw  # Pass a string directly
    single_text_prompt = text_prompt  # Pass a TextPrompt
    single_tokens_prompt = tokens_prompt  # Pass a TokensPrompt

    # ruff: noqa: E501
    # - Pass explicit encoder and decoder input prompts within one data structure.
    #   Encoder and decoder prompts can both independently be text or tokens, with
    #   no requirement that they be the same prompt type. Some example prompt-type
    #   combinations are shown below, note that these are not exhaustive.
    enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
        # Pass encoder prompt string directly, &
        # pass decoder prompt tokens
        encoder_prompt=single_text_prompt_raw,
        decoder_prompt=single_tokens_prompt,
    )
    enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
        # Pass TextPrompt to encoder, and
        # pass decoder prompt string directly
        encoder_prompt=single_text_prompt,
        decoder_prompt=single_text_prompt_raw,
    )
    enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
        # Pass encoder prompt tokens directly, and
        # pass TextPrompt to decoder
        encoder_prompt=single_tokens_prompt,
        decoder_prompt=single_text_prompt,
    )

    # - Finally, here's a useful helper function for zipping encoder and
    #   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
    #   instances
    zipped_prompt_list = zip_enc_dec_prompts(
        ["An encoder prompt", "Another encoder prompt"],
        ["A decoder prompt", "Another decoder prompt"],
    )

    # - Let's put all of the above example prompts together into one list
    #   which we will pass to the encoder/decoder LLM.
    return [
        single_text_prompt_raw,
        single_text_prompt,
        single_tokens_prompt,
        enc_dec_prompt1,
        enc_dec_prompt2,
        enc_dec_prompt3,
    ] + zipped_prompt_list


# Create a sampling params object.
def create_sampling_params():
    return SamplingParams(
        temperature=0,
        top_p=1.0,
        min_tokens=0,
        max_tokens=20,
    )


# Print the outputs.
def print_outputs(outputs):
    print("-" * 50)
    for i, output in enumerate(outputs):
        prompt = output.prompt
        encoder_prompt = output.encoder_prompt
        generated_text = output.outputs[0].text
        print(f"Output {i + 1}:")
        print(
            f"Encoder prompt: {encoder_prompt!r}\n"
            f"Decoder prompt: {prompt!r}\n"
            f"Generated text: {generated_text!r}"
        )
        print("-" * 50)


def main():
    dtype = "float"

    # Create a BART encoder/decoder model instance
    llm = LLM(
        model="facebook/bart-large-cnn",
        dtype=dtype,
    )

    # Get BART tokenizer
    tokenizer = llm.llm_engine.get_tokenizer_group()

    prompts = create_prompts(tokenizer)
    sampling_params = create_sampling_params()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    print_outputs(outputs)


if __name__ == "__main__":
    main()
examples/offline_inference/encoder_decoder_multimodal.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""

import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: Sequence[PromptType]


def run_florence2():
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="Isotr0py/Florence-2-tokenizer",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # implicit prompt with task token
            "prompt": "<DETAILED_CAPTION>",
            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
        },
        {  # explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "Describe in detail what is shown in the image.",
                "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
            },
            "decoder_prompt": "",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_mllama():
    engine_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {  # Implicit prompt
            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        {  # Explicit prompt
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {
                    "image": ImageAsset("stop_sign").pil_image,
                },
            },
            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_whisper():
    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    prompts = [
        {  # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
        {  # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


model_example_map = {
    "florence2": run_florence2,
    "mllama": run_mllama,
    "whisper": run_whisper,
}


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="mllama",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
        skip_special_tokens=False,
    )

    start = time.time()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")

    duration = time.time() - start

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/llm_engine_example.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates using the `LLMEngine`
for processing prompts with various sampling parameters.
"""

import argparse

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser


def create_test_prompts() -> list[tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
        (
            "A robot may not injure a human being",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1),
        ),
        (
            "To be or not to be,",
            SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2),
        ),
        (
            "What is the meaning of life?",
            SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1),
        ),
    ]


def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    print("-" * 50)
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1

        request_outputs: list[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)
                print("-" * 50)


def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Initialize the LLMEngine from the command line arguments."""
    engine_args = EngineArgs.from_cli_args(args)
    return LLMEngine.from_engine_args(engine_args)


def parse_args():
    parser = FlexibleArgumentParser(description="Demo on using the LLMEngine class directly")
    parser = EngineArgs.add_cli_args(parser)
    return parser.parse_args()


def main(args: argparse.Namespace):
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine(args)
    test_prompts = create_test_prompts()
    process_requests(engine, test_prompts)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/offline_inference/load_sharded_state.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Validates the loading of a model saved with the sharded_state format.
This script demonstrates how to load a model that was previously saved
using save_sharded_state.py and validates it by running inference.

Example usage:
(First, save a sharded_state model)
python save_sharded_state.py \
    --model /path/to/load \
    --quantization deepspeedfp \
    --tensor-parallel-size 8 \
    --output /path/to/save/sharded/model

python load_sharded_state.py \
    --model /path/to/saved/sharded/model \
    --load-format sharded_state \
    --quantization deepspeedfp \
    --tensor-parallel-size 8 \
    --prompt "Hello, my name is" \
    --max-tokens 50
"""

import dataclasses

from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils import FlexibleArgumentParser


def parse_args():
    parser = FlexibleArgumentParser()
    # Add engine arguments
    EngineArgs.add_cli_args(parser)
    # Override default load_format for clarity
    parser.set_defaults(load_format="sharded_state")

    # Add validation arguments
    parser.add_argument(
        "--prompt", type=str, default="Hello, world!", help="Prompt for validation"
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=100,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Sampling temperature"
    )
    parser.add_argument(
        "--top-p", type=float, default=1.0, help="Top-p sampling parameter"
    )

    return parser.parse_args()


def main():
    args = parse_args()
    engine_args = EngineArgs.from_cli_args(args)

    print(f"Loading model from {engine_args.model} using format {engine_args.load_format}")
    print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")

    # Load the model using engine args
    llm = LLM(**dataclasses.asdict(engine_args))

    # Prepare sampling parameters
    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_p=args.top_p,
        max_tokens=args.max_tokens,
    )

    print("\nRunning inference:")
    print(f"Prompt: {args.prompt}")

    # Generate completion
    outputs = llm.generate(args.prompt, sampling_params)

    # Display generated text
    print("\nGenerated outputs:")
    for output in outputs:
        generated_text = output.outputs[0].text
        print("-" * 50)
        print(f"Full output: {args.prompt}{generated_text}")
        print("-" * 50)


if __name__ == "__main__":
    main()
examples/offline_inference/lora_with_quantization_inference.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.

Requires HuggingFace credentials for access.
"""

import gc
from typing import Optional

import torch
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest


def create_test_prompts(
    lora_path: str,
) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
    return [
        # this is an example of using quantization without LoRA
        (
            "My name is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            None,
        ),
        # the next three examples use quantization with LoRA
        (
            "my name is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            LoRARequest("lora-test-1", 1, lora_path),
        ),
        (
            "The capital of USA is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            LoRARequest("lora-test-2", 1, lora_path),
        ),
        (
            "The capital of France is",
            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128),
            LoRARequest("lora-test-3", 1, lora_path),
        ),
    ]


def process_requests(
    engine: LLMEngine,
    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params, lora_request = test_prompts.pop(0)
            engine.add_request(
                str(request_id), prompt, sampling_params, lora_request=lora_request
            )
            request_id += 1

        request_outputs: list[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("----------------------------------------------------")
                print(f"Prompt: {request_output.prompt}")
                print(f"Output: {request_output.outputs[0].text}")


def initialize_engine(model: str, quantization: str, lora_repo: Optional[str]) -> LLMEngine:
    """Initialize the LLMEngine."""
    engine_args = EngineArgs(
        model=model,
        quantization=quantization,
        enable_lora=True,
        max_lora_rank=64,
        max_loras=4,
    )
    return LLMEngine.from_engine_args(engine_args)


def main():
    """Main function that sets up and runs the prompt processing."""
    test_configs = [
        # QLoRA (https://arxiv.org/abs/2305.14314)
        {
            "name": "qlora_inference_example",
            "model": "huggyllama/llama-7b",
            "quantization": "bitsandbytes",
            "lora_repo": "timdettmers/qlora-flan-7b",
        },
        {
            "name": "AWQ_inference_with_lora_example",
            "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
            "quantization": "awq",
            "lora_repo": "jashing/tinyllama-colorist-lora",
        },
        {
            "name": "GPTQ_inference_with_lora_example",
            "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
            "quantization": "gptq",
            "lora_repo": "jashing/tinyllama-colorist-lora",
        },
    ]

    for test_config in test_configs:
        print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~")
        engine = initialize_engine(
            test_config["model"], test_config["quantization"], test_config["lora_repo"]
        )
        lora_path = snapshot_download(repo_id=test_config["lora_repo"])
        test_prompts = create_test_prompts(lora_path)
        process_requests(engine, test_prompts)

        # Clean up the GPU memory for the next test
        del engine
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()
examples/offline_inference/metrics.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create an LLM.
    llm = LLM(model="facebook/opt-125m", disable_log_stats=False)

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)

    # Dump all metrics
    for metric in llm.get_metrics():
        if isinstance(metric, Gauge):
            print(f"{metric.name} (gauge) = {metric.value}")
        elif isinstance(metric, Counter):
            print(f"{metric.name} (counter) = {metric.value}")
        elif isinstance(metric, Vector):
            print(f"{metric.name} (vector) = {metric.values}")
        elif isinstance(metric, Histogram):
            print(f"{metric.name} (histogram)")
            print(f"    sum = {metric.sum}")
            print(f"    count = {metric.count}")
            for bucket_le, value in metric.buckets.items():
                print(f"        {bucket_le} = {value}")


if __name__ == "__main__":
    main()
examples/offline_inference_pixtral.py → examples/offline_inference/mistral-small.py (renamed)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams
from vllm.assets.image import ImageAsset

-# This script is an offline demo for running Pixtral.
+# This script is an offline demo for running Mistral-Small-3.1
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
+# # Mistral format
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --tokenizer-mode mistral --config-format mistral --load-format mistral \
+#   --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
+#
+# # HF format
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
# ```
#
# - Client:
...
@@ -21,7 +32,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
-#     "model": "mistralai/Pixtral-12B-2409",
+#     "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
#     "messages": [
#       {
#         "role": "user",
...
@@ -40,51 +51,61 @@ from vllm.sampling_params import SamplingParams
# python demo.py simple
# python demo.py advanced

# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
# These scripts have been tested on 2x L40 GPUs


-def run_simple_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
+def run_simple_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
-    llm = LLM(model=model_name, tokenizer_mode="mistral")
+    llm = LLM(
+        model=model_name,
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
+        limit_mm_per_prompt={"image": 1},
+        max_model_len=4096,
+        max_num_seqs=2,
+        tensor_parallel_size=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompt = "Describe this image in one sentence."
    image_url = "https://picsum.photos/id/237/200/300"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
            ],
        },
    ]

    outputs = llm.chat(messages, sampling_params=sampling_params)

    print("-" * 50)
    print(outputs[0].outputs[0].text)
    print("-" * 50)


-def run_advanced_demo():
-    model_name = "mistralai/Pixtral-12B-2409"
-    max_img_per_msg = 5
+def run_advanced_demo(args: argparse.Namespace):
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    max_img_per_msg = 3
    max_tokens_per_img = 4096

    sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
    llm = LLM(
        model=model_name,
-        tokenizer_mode="mistral",
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
        tensor_parallel_size=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompt = "Describe the following image."
...
@@ -95,25 +116,11 @@ def run_advanced_demo():
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": url_1}},
                {"type": "image_url", "image_url": {"url": url_2}},
            ],
        },
        {
...
@@ -127,23 +134,21 @@ def run_advanced_demo():
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": url_3}},
            ],
        },
    ]

    outputs = llm.chat(messages=messages, sampling_params=sampling_params)

    print("-" * 50)
    print(outputs[0].outputs[0].text)
    print("-" * 50)


-def main():
+def parse_args():
    parser = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode."
    )

    parser.add_argument(
        "mode",
...
@@ -151,14 +156,30 @@ def main():
        help="Specify the demo mode: 'simple' or 'advanced'",
    )
-    args = parser.parse_args()
+    parser.add_argument(
+        "--format",
+        choices=["mistral", "hf"],
+        default="mistral",
+        help="Specify the format of the model to load.",
+    )
+    parser.add_argument(
+        "--disable-mm-preprocessor-cache",
+        action="store_true",
+        help="If True, disables caching of multi-modal preprocessor/mapper.",
+    )
+    return parser.parse_args()


+def main():
+    args = parse_args()
    if args.mode == "simple":
        print("Running simple demo...")
-        run_simple_demo()
+        run_simple_demo(args)
    elif args.mode == "advanced":
        print("Running advanced demo...")
-        run_advanced_demo()
+        run_advanced_demo(args)

if __name__ == "__main__":
...
examples/offline_inference_mlpspeculator.py → examples/offline_inference/mlpspeculator.py (renamed)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the usage of text generation with an LLM model,
comparing the performance with and without speculative decoding.

Note that still not support `v1`:
    VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
"""

import gc
import time
-from typing import List

from vllm import LLM, SamplingParams


-def time_generation(llm: LLM, prompts: List[str],
-                    sampling_params: SamplingParams):
+def time_generation(
+    llm: LLM, prompts: list[str], sampling_params: SamplingParams, title: str
+):
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    # Warmup first
...
@@ -15,19 +25,22 @@ def time_generation(llm: LLM, prompts: List[str],
    start = time.time()
    outputs = llm.generate(prompts, sampling_params)
    end = time.time()
-    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
+    print("-" * 50)
+    print(title)
+    print("time: ", (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))
    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")
    print("-" * 50)


-if __name__ == "__main__":
+def main():
    template = (
        "Below is an instruction that describes a task. Write a response "
        "that appropriately completes the request.\n\n### Instruction:\n{}"
        "\n\n### Response:\n"
    )

    # Sample prompts.
    prompts = [
...
@@ -40,8 +53,7 @@ if __name__ == "__main__":
    # Create an LLM without spec decoding
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")

-    print("Without speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "Without speculation")

    del llm
    gc.collect()
...
@@ -49,10 +61,13 @@ if __name__ == "__main__":
    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
-        speculative_model="ibm-fms/llama-13b-accelerator",
-        # These are currently required for MLPSpeculator decoding
-        use_v2_block_manager=True,
+        speculative_config={
+            "model": "ibm-ai-platform/llama-13b-accelerator",
+        },
    )

-    print("With speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "With speculation")


+if __name__ == "__main__":
+    main()
examples/offline_inference/multilora_inference.py (new file, mode 100644; diff collapsed in this view)
examples/offline_inference/neuron.py (new file, mode 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create an LLM.
    llm = LLM(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        max_num_seqs=8,
        # The max_model_len and block_size arguments are required to be same as
        # max sequence length when targeting neuron device.
        # Currently, this is a known limitation in continuous batching support
        # in transformers-neuronx.
        # TODO(liangfu): Support paged-attention in transformers-neuronx.
        max_model_len=1024,
        block_size=1024,
        # ruff: noqa: E501
        # The device can be automatically detected when AWS Neuron SDK is installed.
        # The device argument can be either unspecified for automated detection,
        # or explicitly assigned.
        device="neuron",
        tensor_parallel_size=2,
    )

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)


if __name__ == "__main__":
    main()
examples/offline_inference/neuron_eagle.py (new file, mode 100644; diff collapsed in this view)