Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
220e6456
Commit
220e6456
authored
Jan 06, 2025
by
zhuwenwen
Browse files
update qwen2-moe layout and benchmark_throughput.py
parent
96ae75ad
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
76 deletions
+36
-76
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+18
-38
vllm/benchmarks/benchmark_throughput.py
vllm/benchmarks/benchmark_throughput.py
+18
-38
No files found.
benchmarks/benchmark_throughput.py
View file @
220e6456
...
@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def
run_vllm
(
def
run_vllm
(
warmup_requests
:
List
[
SampleRequest
],
requests
:
List
[
SampleRequest
],
requests
:
List
[
SampleRequest
],
n
:
int
,
n
:
int
,
num_iters_warmup
:
int
,
engine_args
:
EngineArgs
,
engine_args
:
EngineArgs
,
)
->
float
:
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
...
@@ -193,40 +193,23 @@ def run_vllm(
...
@@ -193,40 +193,23 @@ def run_vllm(
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
# warmup
# warmup
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_sampling_params
=
SamplingParams
(
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
n
=
args
.
n
,
for
request
in
warmup_requests
:
warmup_prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
warmup_sampling_params
.
append
(
SamplingParams
(
n
=
n
,
temperature
=
1.0
,
temperature
=
1.0
,
top_p
=
1.0
,
top_p
=
1.0
,
ignore_eos
=
True
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
max_tokens
=
10
,
))
)
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
1
,
10
))
dummy_prompts
:
List
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
print
(
"Warming up..."
)
print
(
"Warming up..."
)
for
_
in
tqdm
(
range
(
args
.
num_iters_warmup
),
desc
=
"Warmup iterations"
):
for
_
in
tqdm
(
range
(
num_iters_warmup
),
desc
=
"Warmup iterations"
):
llm
.
generate
(
warmup_prompts
,
warmup_sampling_params
,
use_tqdm
=
True
)
llm
.
generate
(
dummy_prompts
,
sampling_params
=
warmup_sampling_params
,
# dummy_prompt_token_ids = np.random.randint(10000,
use_tqdm
=
False
)
# size=(args.num_prompts,
# args.input_len))
# dummy_prompts: List[PromptType] = [{
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion(profile_dir: Optional[str] = None):
# llm.generate(dummy_prompts,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion(profile_dir=None)
use_beam_search
=
False
use_beam_search
=
False
...
@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
...
@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
# Sample the requests.
# Sample the requests.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
tokenizer
=
AutoTokenizer
.
from_pretrained
(
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
warmup_prompt
=
"hi"
*
10
warmup_requests
=
[(
warmup_prompt
,
10
,
10
)
for
_
in
range
(
1
)]
if
args
.
dataset
is
None
:
if
args
.
dataset
is
None
:
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
requests
=
[]
requests
=
[]
...
@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
...
@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
args
.
disable_frontend_multiprocessing
,
args
.
disable_frontend_multiprocessing
,
))
))
else
:
else
:
elapsed_time
=
run_vllm
(
warmup_
requests
,
requests
,
args
.
n
,
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
args
.
num_iters_warmup
,
EngineArgs
.
from_cli_args
(
args
))
EngineArgs
.
from_cli_args
(
args
))
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
...
...
vllm/benchmarks/benchmark_throughput.py
View file @
220e6456
...
@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -165,9 +165,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def
run_vllm
(
def
run_vllm
(
warmup_requests
:
List
[
SampleRequest
],
requests
:
List
[
SampleRequest
],
requests
:
List
[
SampleRequest
],
n
:
int
,
n
:
int
,
num_iters_warmup
:
int
,
engine_args
:
EngineArgs
,
engine_args
:
EngineArgs
,
)
->
float
:
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
...
@@ -193,40 +193,23 @@ def run_vllm(
...
@@ -193,40 +193,23 @@ def run_vllm(
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
# warmup
# warmup
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_sampling_params
=
SamplingParams
(
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
n
=
args
.
n
,
for
request
in
warmup_requests
:
warmup_prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
warmup_sampling_params
.
append
(
SamplingParams
(
n
=
n
,
temperature
=
1.0
,
temperature
=
1.0
,
top_p
=
1.0
,
top_p
=
1.0
,
ignore_eos
=
True
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
max_tokens
=
10
,
))
)
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
1
,
10
))
dummy_prompts
:
List
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
print
(
"Warming up..."
)
print
(
"Warming up..."
)
for
_
in
tqdm
(
range
(
args
.
num_iters_warmup
),
desc
=
"Warmup iterations"
):
for
_
in
tqdm
(
range
(
num_iters_warmup
),
desc
=
"Warmup iterations"
):
llm
.
generate
(
warmup_prompts
,
warmup_sampling_params
,
use_tqdm
=
True
)
llm
.
generate
(
dummy_prompts
,
sampling_params
=
warmup_sampling_params
,
# dummy_prompt_token_ids = np.random.randint(10000,
use_tqdm
=
False
)
# size=(args.num_prompts,
# args.input_len))
# dummy_prompts: List[PromptType] = [{
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion(profile_dir: Optional[str] = None):
# llm.generate(dummy_prompts,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion(profile_dir=None)
use_beam_search
=
False
use_beam_search
=
False
...
@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
...
@@ -384,9 +367,6 @@ def main(args: argparse.Namespace):
# Sample the requests.
# Sample the requests.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
tokenizer
=
AutoTokenizer
.
from_pretrained
(
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
warmup_prompt
=
"hi"
*
10
warmup_requests
=
[(
warmup_prompt
,
10
,
10
)
for
_
in
range
(
1
)]
if
args
.
dataset
is
None
:
if
args
.
dataset
is
None
:
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
requests
=
[]
requests
=
[]
...
@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
...
@@ -442,7 +422,7 @@ def main(args: argparse.Namespace):
args
.
disable_frontend_multiprocessing
,
args
.
disable_frontend_multiprocessing
,
))
))
else
:
else
:
elapsed_time
=
run_vllm
(
warmup_
requests
,
requests
,
args
.
n
,
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
args
.
num_iters_warmup
,
EngineArgs
.
from_cli_args
(
args
))
EngineArgs
.
from_cli_args
(
args
))
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment