Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e7c1b7f3
Commit
e7c1b7f3
authored
Sep 06, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.5.4-dtk24.04.1'
parents
7462218e
04c62b93
Changes
442
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
38 additions
and
52 deletions
+38
-52
vllm/benchmark_throughput.py
vllm/benchmark_throughput.py
+38
-9
vllm/block.py
vllm/block.py
+0
-43
No files found.
Too many changes to show.
To preserve performance only
442 of 442+
files are displayed.
Plain diff
Email patch
vllm/benchmark_throughput.py
View file @
e7c1b7f3
...
@@ -11,8 +11,10 @@ from tqdm import tqdm
...
@@ -11,8 +11,10 @@ from tqdm import tqdm
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
PreTrainedTokenizerBase
)
PreTrainedTokenizerBase
)
from
vllm.inputs
import
PromptStrictInputs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.inputs
import
PromptInputs
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.utils
import
FlexibleArgumentParser
def
sample_requests
(
def
sample_requests
(
...
@@ -84,6 +86,7 @@ def run_vllm(
...
@@ -84,6 +86,7 @@ def run_vllm(
distributed_executor_backend
:
Optional
[
str
],
distributed_executor_backend
:
Optional
[
str
],
gpu_memory_utilization
:
float
=
0.9
,
gpu_memory_utilization
:
float
=
0.9
,
download_dir
:
Optional
[
str
]
=
None
,
download_dir
:
Optional
[
str
]
=
None
,
load_format
:
str
=
EngineArgs
.
load_format
,
)
->
float
:
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
llm
=
LLM
(
...
@@ -105,11 +108,12 @@ def run_vllm(
...
@@ -105,11 +108,12 @@ def run_vllm(
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
load_format
=
load_format
,
)
)
# Add the requests to the engine.
# Add the requests to the engine.
prompts
=
[]
prompts
:
List
[
str
]
=
[]
sampling_params
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
for
prompt
,
_
,
output_len
in
requests
:
for
prompt
,
_
,
output_len
in
requests
:
prompts
.
append
(
prompt
)
prompts
.
append
(
prompt
)
sampling_params
.
append
(
sampling_params
.
append
(
...
@@ -144,7 +148,7 @@ def run_vllm(
...
@@ -144,7 +148,7 @@ def run_vllm(
# dummy_prompt_token_ids = np.random.randint(10000,
# dummy_prompt_token_ids = np.random.randint(10000,
# size=(args.num_prompts,
# size=(args.num_prompts,
# args.input_len))
# args.input_len))
# dummy_inputs: List[Prompt
Strict
Inputs] = [{
# dummy_inputs: List[PromptInputs] = [{
# "prompt_token_ids": batch
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# } for batch in dummy_prompt_token_ids.tolist()]
...
@@ -270,7 +274,7 @@ def main(args: argparse.Namespace):
...
@@ -270,7 +274,7 @@ def main(args: argparse.Namespace):
args
.
quantization_param_path
,
args
.
device
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
download_dir
)
args
.
gpu_memory_utilization
,
args
.
download_dir
,
args
.
load_format
)
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
...
@@ -283,6 +287,7 @@ def main(args: argparse.Namespace):
...
@@ -283,6 +287,7 @@ def main(args: argparse.Namespace):
raise
ValueError
(
f
"Unknown backend:
{
args
.
backend
}
"
)
raise
ValueError
(
f
"Unknown backend:
{
args
.
backend
}
"
)
total_num_tokens
=
sum
(
prompt_len
+
output_len
total_num_tokens
=
sum
(
prompt_len
+
output_len
for
_
,
prompt_len
,
output_len
in
requests
)
for
_
,
prompt_len
,
output_len
in
requests
)
if
args
.
dataset
is
None
:
if
args
.
dataset
is
None
:
total_out_tokens
=
args
.
output_len
*
args
.
num_prompts
total_out_tokens
=
args
.
output_len
*
args
.
num_prompts
else
:
else
:
...
@@ -307,7 +312,7 @@ def main(args: argparse.Namespace):
...
@@ -307,7 +312,7 @@ def main(args: argparse.Namespace):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Benchmark the throughput."
)
parser
=
Flexible
ArgumentParser
(
description
=
"Benchmark the throughput."
)
parser
.
add_argument
(
"--backend"
,
parser
.
add_argument
(
"--backend"
,
type
=
str
,
type
=
str
,
choices
=
[
"vllm"
,
"hf"
,
"mii"
],
choices
=
[
"vllm"
,
"hf"
,
"mii"
],
...
@@ -398,9 +403,10 @@ if __name__ == "__main__":
...
@@ -398,9 +403,10 @@ if __name__ == "__main__":
parser
.
add_argument
(
parser
.
add_argument
(
"--device"
,
"--device"
,
type
=
str
,
type
=
str
,
default
=
"cuda"
,
default
=
"auto"
,
choices
=
[
"cuda"
,
"cpu"
],
choices
=
[
"auto"
,
"cuda"
,
"cpu"
,
"openvino"
,
"tpu"
,
"xpu"
],
help
=
'device type for vLLM execution, supporting CUDA and CPU.'
)
help
=
'device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.'
)
parser
.
add_argument
(
parser
.
add_argument
(
"--enable-prefix-caching"
,
"--enable-prefix-caching"
,
action
=
'store_true'
,
action
=
'store_true'
,
...
@@ -430,6 +436,29 @@ if __name__ == "__main__":
...
@@ -430,6 +436,29 @@ if __name__ == "__main__":
help
=
'Backend to use for distributed serving. When more than 1 GPU '
help
=
'Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.'
)
'or "mp" (multiprocessing) otherwise.'
)
parser
.
add_argument
(
'--load-format'
,
type
=
str
,
default
=
EngineArgs
.
load_format
,
choices
=
[
'auto'
,
'pt'
,
'safetensors'
,
'npcache'
,
'dummy'
,
'tensorizer'
,
'bitsandbytes'
],
help
=
'The format of the model weights to load.
\n\n
'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.
\n
'
'* "pt" will load the weights in the pytorch bin format.
\n
'
'* "safetensors" will load the weights in the safetensors format.
\n
'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.
\n
'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.
\n
'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.
\n
'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.
\n
'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
if
args
.
tokenizer
is
None
:
args
.
tokenizer
=
args
.
model
args
.
tokenizer
=
args
.
model
...
...
vllm/block.py
View file @
e7c1b7f3
...
@@ -3,52 +3,9 @@ from typing import List
...
@@ -3,52 +3,9 @@ from typing import List
from
vllm.utils
import
Device
from
vllm.utils
import
Device
_BLANK_TOKEN_ID
=
-
1
DEFAULT_LAST_ACCESSED_TIME
=
-
1
DEFAULT_LAST_ACCESSED_TIME
=
-
1
class
LogicalTokenBlock
:
"""A block that stores a contiguous chunk of tokens from left to right.
Logical blocks are used to represent the states of the corresponding
physical blocks in the KV cache.
"""
def
__init__
(
self
,
block_number
:
int
,
block_size
:
int
,
)
->
None
:
self
.
block_number
=
block_number
self
.
block_size
=
block_size
self
.
token_ids
=
[
_BLANK_TOKEN_ID
]
*
block_size
self
.
num_tokens
=
0
def
is_empty
(
self
)
->
bool
:
return
self
.
num_tokens
==
0
def
get_num_empty_slots
(
self
)
->
int
:
return
self
.
block_size
-
self
.
num_tokens
def
is_full
(
self
)
->
bool
:
return
self
.
num_tokens
==
self
.
block_size
def
append_tokens
(
self
,
token_ids
:
List
[
int
])
->
None
:
assert
len
(
token_ids
)
<=
self
.
get_num_empty_slots
()
curr_idx
=
self
.
num_tokens
self
.
token_ids
[
curr_idx
:
curr_idx
+
len
(
token_ids
)]
=
token_ids
self
.
num_tokens
+=
len
(
token_ids
)
def
get_token_ids
(
self
)
->
List
[
int
]:
return
self
.
token_ids
[:
self
.
num_tokens
]
def
get_last_token_id
(
self
)
->
int
:
assert
self
.
num_tokens
>
0
return
self
.
token_ids
[
self
.
num_tokens
-
1
]
class
PhysicalTokenBlock
:
class
PhysicalTokenBlock
:
"""Represents the state of a block in the KV cache."""
"""Represents the state of a block in the KV cache."""
...
...
Prev
1
…
19
20
21
22
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment