tsoc / vllm-auto-test · Commits

Commit d1a06223
Authored Feb 24, 2026 by liuxu3

    added vllm092 auto test scripts

Parent: fba2e3b5
Changes: 162 files in this commit; the listing below shows 20 changed files with 5520 additions and 0 deletions (+5520, -0).
offline_benchmark_test/benchmarks/benchmark_throughput.py (+757, -0)
offline_benchmark_test/benchmarks/benchmark_utils.py (+74, -0)
offline_benchmark_test/benchmarks/cutlass_benchmarks/sparse_benchmarks.py (+516, -0)
offline_benchmark_test/benchmarks/cutlass_benchmarks/utils.py (+100, -0)
offline_benchmark_test/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py (+372, -0)
offline_benchmark_test/benchmarks/cutlass_benchmarks/weight_shapes.py (+46, -0)
offline_benchmark_test/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh (+145, -0)
offline_benchmark_test/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh (+163, -0)
offline_benchmark_test/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py (+63, -0)
offline_benchmark_test/benchmarks/disagg_benchmarks/round_robin_proxy.py (+63, -0)
offline_benchmark_test/benchmarks/disagg_benchmarks/visualize_benchmark_results.py (+47, -0)
offline_benchmark_test/benchmarks/fused_kernels/layernorm_rms_benchmarks.py (+228, -0)
offline_benchmark_test/benchmarks/kernels/bench_fp8_gemm.py (+159, -0)
offline_benchmark_test/benchmarks/kernels/bench_int8_gemm.py (+169, -0)
offline_benchmark_test/benchmarks/kernels/benchmark_aqlm.py (+345, -0)
offline_benchmark_test/benchmarks/kernels/benchmark_bitblas.py (+242, -0)
offline_benchmark_test/benchmarks/kernels/benchmark_cutlass_fp4_moe.py (+490, -0)
offline_benchmark_test/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py (+383, -0)
offline_benchmark_test/benchmarks/kernels/benchmark_layernorm.py (+93, -0)
offline_benchmark_test/benchmarks/kernels/benchmark_lora.py (+1065, -0)
offline_benchmark_test/benchmarks/benchmark_throughput.py (new file, mode 100644)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
import os
import random
import time
import warnings
from typing import Any, Optional, Union

import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase

from benchmark_dataset import (
    AIMODataset,
    BurstGPTDataset,
    ConversationDataset,
    InstructCoderDataset,
    RandomDataset,
    SampleRequest,
    ShareGPTDataset,
    SonnetDataset,
    VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
def run_vllm(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
) -> tuple[float, float, float, Optional[list[RequestOutput]]]:
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " prompt_len and expected_output_len for all requests."
    )
    # Add the requests to the engine.
    prompts: list[Union[TextPrompt, TokensPrompt]] = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(
            TokensPrompt(
                prompt_token_ids=request.prompt["prompt_token_ids"],
                multi_modal_data=request.multi_modal_data,
            )
            if "prompt_token_ids" in request.prompt
            else TextPrompt(
                prompt=request.prompt, multi_modal_data=request.multi_modal_data
            )
        )
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            )
        )
    lora_requests: Optional[list[LoRARequest]] = None
    if engine_args.enable_lora:
        lora_requests = [request.lora_request for request in requests]

    use_beam_search = False
    outputs = None

    if not use_beam_search:
        start = time.perf_counter()
        outputs = llm.generate(
            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
        )
        end = time.perf_counter()

        # Collect per-request TTFT and TPOT from the engine metrics.
        # NOTE: output_len is read from the global CLI `args`.
        total_ttfts = []
        total_tpots = []
        for output in outputs:
            if output.metrics.first_token_time is None:
                continue
            ttft_ = output.metrics.first_token_time - output.metrics.arrival_time
            tpot_ = (
                output.metrics.finished_time - output.metrics.arrival_time - ttft_
            ) / (args.output_len - 1)
            total_ttfts.append(ttft_)
            total_tpots.append(tpot_)

        import numpy as np

        ttft_mean = np.mean(total_ttfts)
        ttft_max = np.max(total_ttfts)
        ttft_min = np.min(total_ttfts)
        tpot_mean = np.mean(total_tpots)
        tpot_min = np.min(total_tpots)
        tpot_max = np.max(total_tpots)
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
        prompts = [request.prompt for request in requests]
        # output_len should be the same for all requests.
        output_len = requests[0].expected_output_len
        for request in requests:
            assert request.expected_output_len == output_len
        start = time.perf_counter()
        llm.beam_search(
            prompts,
            BeamSearchParams(
                beam_width=n,
                max_tokens=output_len,
                ignore_eos=True,
            ),
        )
        end = time.perf_counter()
    return end - start, ttft_mean, tpot_mean, outputs
    # return end - start, outputs
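# The statistics above are derived from RequestOutput.metrics, assuming the
# engine populates arrival_time, first_token_time and finished_time (as the
# V0 engine metrics do):
#   TTFT = first_token_time - arrival_time
#   TPOT = (finished_time - arrival_time - TTFT) / (output_len - 1)
# Since ignore_eos=True, every request generates exactly args.output_len
# tokens, which is what makes the fixed TPOT denominator meaningful.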
def run_vllm_chat(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.
    """
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))

    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
        for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of "
        "prompt_len and expected_output_len for all requests."
    )

    prompts = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        prompts.append(request.prompt)
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                detokenize=not disable_detokenize,
            )
        )
    start = time.perf_counter()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
    return end - start, outputs
async def run_vllm_async(
    requests: list[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
    disable_detokenize: bool = False,
) -> float:
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
        engine_args, disable_frontend_multiprocessing
    ) as llm:
        model_config = await llm.get_model_config()
        assert all(
            model_config.max_model_len
            >= (request.prompt_len + request.expected_output_len)
            for request in requests
        ), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests."
        )

        # Add the requests to the engine.
        prompts: list[Union[TextPrompt, TokensPrompt]] = []
        sampling_params: list[SamplingParams] = []
        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TokensPrompt(
                    prompt_token_ids=request.prompt["prompt_token_ids"],
                    multi_modal_data=request.multi_modal_data,
                )
                if "prompt_token_ids" in request.prompt
                else TextPrompt(
                    prompt=request.prompt, multi_modal_data=request.multi_modal_data
                )
            )
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    detokenize=not disable_detokenize,
                )
            )
            lora_requests.append(request.lora_request)

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp, lr) in enumerate(
            zip(prompts, sampling_params, lora_requests)
        ):
            generator = llm.generate(
                prompt, sp, lora_request=lr, request_id=f"test{i}"
            )
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start
def run_hf(
    requests: list[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
    disable_detokenize: bool = False,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: list[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        if not disable_detokenize:
            # Include the decoding time.
            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start
def run_mii(
    requests: list[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
) -> float:
    from mii import client, serve

    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [request.prompt for request in requests]

    start = time.perf_counter()
    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
    client = client(model)
    client.terminate_server()
    return end - start
def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={
            "requests_per_second": [results["requests_per_second"]],
            "tokens_per_second": [results["tokens_per_second"]],
        },
        extra_info={
            k: results[k]
            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        },
    )
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
        write_to_json(pt_file, pt_records)
def get_requests(args, tokenizer):
    # Common parameters for all dataset types.
    common_kwargs = {
        "dataset_path": args.dataset_path,
        "random_seed": args.seed,
    }
    sample_kwargs = {
        "tokenizer": tokenizer,
        "lora_path": args.lora_path,
        "max_loras": args.max_loras,
        "num_requests": args.num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
    }
    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
        dataset_cls = RandomDataset
    elif args.dataset_name == "sharegpt":
        dataset_cls = ShareGPTDataset
        if args.backend == "vllm-chat":
            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset."
        )
        dataset_cls = SonnetDataset
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = InstructCoderDataset
            common_kwargs["dataset_split"] = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
def main(args: argparse.Namespace):
    if args.seed is None:
        args.seed = 0
    print(args)
    random.seed(args.seed)
    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
    request_outputs: Optional[list[RequestOutput]] = None
    # Only the synchronous vllm backend populates these latency statistics.
    ttft_mean = tpot_mean = float("nan")
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                    args.disable_detokenize,
                )
            )
        else:
            elapsed_time, ttft_mean, tpot_mean, request_outputs = run_vllm(
                requests,
                args.n,
                EngineArgs.from_cli_args(args),
                args.disable_detokenize,
            )
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
    elif args.backend == "mii":
        elapsed_time = run_mii(
            requests, args.model, args.tensor_parallel_size, args.output_len
        )
    elif args.backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if request_outputs:
        # Note: with the vllm and vllm-chat backends,
        # we have request_outputs, which we use to count tokens.
        total_prompt_tokens = 0
        total_output_tokens = 0
        for ro in request_outputs:
            if not isinstance(ro, RequestOutput):
                continue
            total_prompt_tokens += (
                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
            )
            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        total_num_tokens = sum(
            r.prompt_len + r.expected_output_len for r in requests
        )
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens

    if is_multi_modal and args.backend != "vllm-chat":
        print(
            "\033[91mWARNING\033[0m: Multi-modal request with "
            f"{args.backend} backend detected. The "
            "following metrics are not accurate because image tokens are not"
            " counted. See vllm-project/vllm/issues/9778 for details."
        )
        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
        # vllm-chat backend counts the image tokens now

    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    # print(f"Total num prompt tokens: {total_prompt_tokens}")
    # print(f"Total num output tokens: {total_output_tokens}")
    print(f"Generate Throughput: {total_output_tokens / elapsed_time:.2f} tokens/s")
    print(f"Elapsed_time: {elapsed_time:.2f} s")
    print(f"TTFT mean: {ttft_mean:.4f} s")
    print(f"TPOT mean: {tpot_mean:.4f} s")

    # Output JSON results if specified
    if args.output_json:
        results = {
            "elapsed_time": elapsed_time,
            "num_requests": len(requests),
            "total_num_tokens": total_num_tokens,
            "requests_per_second": len(requests) / elapsed_time,
            "tokens_per_second": total_num_tokens / elapsed_time,
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)
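# For reference, a run with --output-json results.json writes a file shaped
# like the dict built above (values illustrative):
#   {"elapsed_time": ..., "num_requests": ..., "total_num_tokens": ...,
#    "requests_per_second": ..., "tokens_per_second": ...}
# and, when the SAVE_TO_PYTORCH_BENCHMARK_FORMAT environment variable is set,
# save_to_pytorch_benchmark_format() additionally writes results.pytorch.json.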
def validate_args(args):
    """
    Validate command-line arguments.
    """
    # === Deprecation and Defaulting ===
    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
        args.dataset_path = args.dataset

    if not getattr(args, "tokenizer", None):
        args.tokenizer = args.model

    # === Backend Validation ===
    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
    if args.backend not in valid_backends:
        raise ValueError(f"Unsupported backend: {args.backend}")

    # === Dataset Configuration ===
    if not args.dataset and not args.dataset_path:
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")

    # === Dataset Name Specific Checks ===
    # --hf-subset and --hf-split: only used
    # when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
        getattr(args, "hf_subset", None) is not None
        or getattr(args, "hf_split", None) is not None
    ):
        warnings.warn(
            "--hf-subset and --hf-split will be ignored \
                since --dataset-name is not 'hf'.",
            stacklevel=2,
        )
    elif args.dataset_name == "hf":
        if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )  # noqa: E501
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )  # noqa: E501
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")

    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since \
                --dataset-name is not 'random'.",
            stacklevel=2,
        )

    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    if (
        args.dataset_name not in {"random", "sonnet", None}
        and args.prefix_len is not None
    ):
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name \
                is not 'random', 'sonnet', or not set.",
            stacklevel=2,
        )

    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")

    # === Backend-specific Validations ===
    if args.backend == "hf" and args.hf_max_batch_size is None:
        raise ValueError("HF max batch size is required for HF backend")
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")

    if (
        args.backend in {"hf", "mii"}
        and getattr(args, "quantization", None) is not None
    ):
        raise ValueError("Quantization is only for vLLM backend.")

    if args.backend == "mii" and args.dtype != "auto":
        raise ValueError("dtype must be auto for MII backend.")
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")

    # --data-parallel is not supported currently.
    # https://github.com/vllm-project/vllm/issues/16222
    if args.data_parallel_size > 1:
        raise ValueError(
            "Data parallel is not supported in offline benchmark, \
                please use benchmark serving instead"
        )
def create_argument_parser():
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        choices=["vllm", "hf", "mii", "vllm-chat"],
        default="vllm",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in \
            the next release. The dataset is expected to "
        "be a json in form of list[dict[..., conversations: "
        "list[dict[..., value: <prompt_or_response>]]]]",
    )
    parser.add_argument(
        "--dataset-path", type=str, default=None, help="Path to the dataset"
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Input prompt length for each request",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the "
        "output length from the dataset.",
    )
    parser.add_argument(
        "--n", type=int, default=1, help="Number of generated sequences per prompt."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
    )
    parser.add_argument(
        "--hf-max-batch-size",
        type=int,
        default=None,
        help="Maximum batch size for HF backend.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the throughput results in JSON format.",
    )
    parser.add_argument(
        "--async-engine",
        action="store_true",
        default=False,
        help="Use vLLM async engine rather than LLM class.",
    )
    parser.add_argument(
        "--disable-frontend-multiprocessing",
        action="store_true",
        default=False,
        help="Disable decoupled async engine frontend.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize the response (i.e. do not include "
            "detokenization time in the measurement)"
        ),
    )
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the LoRA adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.",
    )
    parser.add_argument(
        "--prefix-len",
        type=int,
        default=None,
        help="Number of prefix tokens to be used in RandomDataset "
        "and SonnetDataset. For RandomDataset, the total input "
        "length is the sum of prefix-len (default: "
        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
        "sampled from [input_len * (1 - range_ratio), "
        "input_len * (1 + range_ratio)]. For SonnetDataset, "
        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
        "controls how much of the input is fixed lines versus "
        "random lines, but the total input length remains approximately "
        "input_len tokens.",
    )
    # random dataset
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
        "for sampling input/output length, "
        "used only for RandomDataset. Must be in the range [0, 1) to "
        "define a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    # hf dataset
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    parser.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )

    parser = AsyncEngineArgs.add_cli_args(parser)
    return parser
if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    validate_args(args)
    main(args)
offline_benchmark_test/benchmarks/benchmark_utils.py (new file, mode 100644)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(
    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
                extra_info["tensor_parallel_size"]
            )

        records.append(record)

    return records
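# A single record produced above has this shape (values illustrative):
#   {
#       "benchmark": {"name": "vLLM benchmark", "extra_info": {"args": {...}}},
#       "model": {"name": args.model},
#       "metric": {
#           "name": "requests_per_second",
#           "benchmark_values": [81.0],
#           "extra_info": {...},
#       },
#   }
# Records are only emitted when the SAVE_TO_PYTORCH_BENCHMARK_FORMAT
# environment variable is set; otherwise the function returns an empty list.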
class InfEncoder(json.JSONEncoder):
    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
        json.dump(
            records,
            f,
            cls=InfEncoder,
            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
        )
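A minimal usage sketch of the two helpers above (the file name and record values are illustrative, and it assumes benchmark_utils.py is importable from the working directory):

    from benchmark_utils import write_to_json

    records = [{"latency_s": float("inf"), "throughput": 81.0, "note": object()}]
    write_to_json("example_records.json", records)
    # The written JSON contains:
    #   [{"latency_s": "inf", "throughput": 81.0,
    #     "note": "<object object is not JSON serializable>"}]

InfEncoder rewrites infinite floats to the string "inf" before encoding, and the default= fallback stringifies anything json cannot serialize, so odd values in benchmark records do not abort the dump.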
offline_benchmark_test/benchmarks/cutlass_benchmarks/sparse_benchmarks.py (new file, mode 100644)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import make_rand_sparse_tensors
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
        "args": args,
        "kwargs": kwargs,
        "fn": fn,
    }
    return TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals=globals,
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)


def bench_int8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.int8
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
        print("Incorrect results")
        print(out)
        print(out_ref)
    else:
        print("Correct results")

    timers = []
    # pytorch impl - bfloat16
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_bf16_bf16_bf16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.bfloat16),
            b.to(dtype=torch.bfloat16),
        )
    )

    # pytorch impl - float16
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp16_fp16_fp16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.float16),
            b.to(dtype=torch.float16),
        )
    )

    # cutlass impl
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_mm",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_mm_bias",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    # cutlass sparse impl
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass sparse with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    return timers


def bench_fp8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.float8_e4m3fn
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
        print("Incorrect results")
        print(out)
        print(out_ref)
    else:
        print("Correct results")

    timers = []

    # pytorch impl w. bf16
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_bf16_bf16_bf16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.bfloat16, device="cuda"),
            b.to(dtype=torch.bfloat16, device="cuda"),
        )
    )

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_bf16_scaled_mm",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
        )
    )

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
        )
    )

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_fp16_scaled_mm",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.float16,
        )
    )

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.float16,
            use_fast_accum=True,
        )
    )

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_mm",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass impl: fp16 output
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.float16,
        )
    )

    # cutlass impl: bf16 output, with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    # cutlass impl: fp16 output, with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.float16,
            bias.to(dtype=torch.float16),
        )
    )

    return timers


def bench(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float8_e4m3fn:
        return bench_fp8(dtype, m, k, n, label, sub_label)
    raise ValueError("unsupported type")


# runner
def print_timers(timers: Iterable[TMeasurement]):
    compare = TBenchmark.Compare(timers)
    compare.print()


def run(
    dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]]
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})")
        print_timers(timers)
        results.extend(timers)
    return results


# output makers
def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

    # pickle all the results
    timestamp = int(time.time()) if timestamp is None else timestamp
    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
        pkl.dump(data, f)


# argparse runners
def run_square_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"square_bench-{args.dtype}")


def run_range_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
    n = len(dim_sizes)
    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
    MKNs = list(zip(Ms, Ks, Ns))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"range_bench-{args.dtype}")


def run_model_bench(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
            KNs.append(KN)
        return KNs

    model_bench_data = []
    models_tps = list(itertools.product(args.models, args.tp_sizes))
    for model, tp_size in models_tps:
        Ms = args.batch_sizes
        KNs = model_shapes(model, tp_size)
        MKNs = []
        for m in Ms:
            for k, n in KNs:
                MKNs.append((m, k, n))

        data = run(args.dtype, MKNs)
        model_bench_data.append(data)

    # Print all results
    for data, model_tp in zip(model_bench_data, models_tps):
        model, tp_size = model_tp
        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
        print_timers(data)

    timestamp = int(time.time())

    all_data = []
    for d in model_bench_data:
        all_data.extend(d)
    # pickle all data
    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
        pkl.dump(all_data, f)


if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError("unsupported dtype")

    parser = FlexibleArgumentParser(
        description="""
Benchmark Cutlass GEMM.

    To run square GEMMs:
        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64

    To run constant N and K and sweep M:
        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384

    To run dimensions from a model:
        python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1

    Output:
        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
    square_parser.add_argument("--dim-start", type=int, required=True)
    square_parser.add_argument("--dim-end", type=int, required=True)
    square_parser.add_argument("--dim-increment", type=int, required=True)
    square_parser.set_defaults(func=run_square_bench)

    range_parser = subparsers.add_parser("range_bench")
    range_parser.add_argument("--dim-start", type=int, required=True)
    range_parser.add_argument("--dim-end", type=int, required=True)
    range_parser.add_argument("--dim-increment", type=int, required=True)
    range_parser.add_argument("--m-constant", type=int, default=None)
    range_parser.add_argument("--n-constant", type=int, default=None)
    range_parser.add_argument("--k-constant", type=int, default=None)
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES)
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    args.func(args)
offline_benchmark_test/benchmarks/cutlass_benchmarks/utils.py (new file, mode 100644)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
from collections.abc import Iterable

import torch

import vllm._custom_ops as ops


def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
        dtype=torch.float8_e4m3fn
    )


def to_int8(tensor: torch.Tensor) -> torch.Tensor:
    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)


def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
    return tensor.to(dtype=torch.bfloat16)


def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    return tensor.to(dtype=torch.float16)


def make_rand_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    if dtype == torch.int8:
        return to_int8(a), to_int8(b)
    if dtype == torch.float8_e4m3fn:
        return to_fp8(a), to_fp8(b)

    raise ValueError("unsupported dtype")


def prune_to_2_4(tensor):
    # Reshape tensor to [N, 4] where N is number of groups of 4
    original_shape = tensor.shape
    reshaped = tensor.reshape(-1, 4)

    # Get indices of top 2 absolute values in each group of 4
    _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)

    # Create binary mask
    mask = torch.zeros_like(reshaped)
    mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))

    # Apply mask and reshape back
    pruned = reshaped * mask

    # Turn all -0.0 to 0.0
    pruned[pruned == -0.0] = 0.0

    return pruned.reshape(original_shape)
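# Worked example (values chosen for illustration): for a group such as
# [0.1, -3.0, 0.2, 5.0], the two largest magnitudes are -3.0 and 5.0, so the
# group becomes [0.0, -3.0, 0.0, 5.0]. Every contiguous group of four elements
# thus keeps at most two non-zeros, which is the 2:4 structured-sparsity
# pattern the sparse CUTLASS kernels expect.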
def make_rand_sparse_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    b = prune_to_2_4(b.t()).t()

    if dtype == torch.int8:
        a, b = to_int8(a), to_int8(b)
    elif dtype == torch.float8_e4m3fn:
        a, b = to_fp8(a), to_fp8(b)
    elif dtype == torch.float16:
        a, b = to_fp16(a), to_fp16(b)
    elif dtype == torch.bfloat16:
        a, b = to_bf16(a), to_bf16(b)
    else:
        raise ValueError("unsupported dtype")

    b_compressed, e = ops.cutlass_sparse_compress(b.t())

    # Compressed B, Metadata, Original A, B
    return b_compressed, e, a, b


def make_n_rand_sparse_tensors(
    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
        if b_comp is not None:
            ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
    BComps, Es, As, Bs = zip(*ABs)
    return list(BComps), list(Es), list(As), list(Bs)
offline_benchmark_test/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py (new file, mode 100644)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable, Optional

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import make_rand_tensors
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
        "args": args,
        "kwargs": kwargs,
        "fn": fn,
    }
    return TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals=globals,
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)


def bench_int8(
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)
    azp = torch.zeros((m,), device="cuda", dtype=torch.int32)
    azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
        ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)
        ),
        "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16
        ),
        "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias
        ),
    }

    timers = []
    for name, fn in bench_fns.items():
        # If bench_kernels is None, run all. Otherwise, run only exact matches.
        if bench_kernels is None or name in bench_kernels:
            print(f"Running {name}")
            timers.append(bench_fn(label, sub_label, name, fn))

    return timers


def bench_fp8(
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
    a_cont = a.contiguous()
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32)
    block_scale_b = torch.rand(
        cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32
    )
    block_scale_a_M_major = block_scale_a.t().contiguous().t()
    block_scale_b_K_major = block_scale_b.t().contiguous().t()
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    print(m, k, n)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
        ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)
        ),
        "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16
        ),
        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True
        ),
        "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16
        ),
        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True
        ),
        "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16
        ),
        "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
            a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16
        ),
    }

    timers = []
    for name, fn in bench_fns.items():
        # If bench_kernels is None, run all. Otherwise, run only exact matches.
        if bench_kernels is None or name in bench_kernels:
            print(f"Running {name}")
            timers.append(bench_fn(label, sub_label, name, fn))

    return timers


def bench(
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
    if dtype == torch.float8_e4m3fn:
        return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels)
    raise ValueError("unsupported type")


# runner
def print_timers(timers: Iterable[TMeasurement]):
    compare = TBenchmark.Compare(timers)
    compare.print()


def run(
    dtype: torch.dtype,
    MKNs: Iterable[tuple[int, int, int]],
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(
            dtype,
            m,
            k,
            n,
            f"scaled-{dtype}-gemm",
            f"MKN=({m}x{k}x{n})",
            bench_kernels=bench_kernels,
        )
        print_timers(timers)
        results.extend(timers)
    return results


def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

    # pickle all the results
    timestamp = int(time.time()) if timestamp is None else timestamp
    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
        pkl.dump(data, f)


def run_square_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
    make_output(data, MKNs, f"square_bench-{args.dtype}")


def run_range_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
    n = len(dim_sizes)
    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
    MKNs = list(zip(Ms, Ks, Ns))
    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
    make_output(data, MKNs, f"range_bench-{args.dtype}")


def run_model_bench(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
            KNs.append(KN)
        return KNs

    model_bench_data = []
    models_tps = list(itertools.product(args.models, args.tp_sizes))
    for model, tp_size in models_tps:
        Ms = args.batch_sizes
        KNs = model_shapes(model, tp_size)
        MKNs = []
        for m in Ms:
            for k, n in KNs:
                MKNs.append((m, k, n))

        data = run(args.dtype, MKNs, bench_kernels=args.kernels)
        model_bench_data.append(data)

    # Print all results
    for data, model_tp in zip(model_bench_data, models_tps):
        model, tp_size = model_tp
        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
        print_timers(data)

    timestamp = int(time.time())

    all_data = []
    for d in model_bench_data:
        all_data.extend(d)
    # pickle all data
    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
        pkl.dump(all_data, f)


if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError("unsupported dtype")

    parser = FlexibleArgumentParser(
        description="""
Benchmark Cutlass GEMM.

    To run square GEMMs:
        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64

    To run constant N and K and sweep M:
        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384

    To run dimensions from a model:
        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1

    Output:
        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    parser.add_argument(
        "--kernels",
        nargs="+",
        type=str,
        default=None,
        help="Exact names of the kernels to benchmark. If not set, runs all kernels.",
    )
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
    square_parser.add_argument("--dim-start", type=int, required=True)
    square_parser.add_argument("--dim-end", type=int, required=True)
    square_parser.add_argument("--dim-increment", type=int, required=True)
    square_parser.set_defaults(func=run_square_bench)

    range_parser = subparsers.add_parser("range_bench")
    range_parser.add_argument("--dim-start", type=int, required=True)
    range_parser.add_argument("--dim-end", type=int, required=True)
    range_parser.add_argument("--dim-increment", type=int, required=True)
    range_parser.add_argument("--m-constant", type=int, default=None)
    range_parser.add_argument("--n-constant", type=int, default=None)
    range_parser.add_argument("--k-constant", type=int, default=None)
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES)
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    args.func(args)
offline_benchmark_test/benchmarks/cutlass_benchmarks/weight_shapes.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
# TP1 shapes
WEIGHT_SHAPES = {
    "mistralai/Mistral-7B-v0.1": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-7b-hf": [
        ([4096, 12288], 1),
        ([4096, 4096], 0),
        ([4096, 22016], 1),
        ([11008, 4096], 0),
    ],
    "meta-llama/Llama-3-8b": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-13b-hf": [
        ([5120, 15360], 1),
        ([5120, 5120], 0),
        ([5120, 27648], 1),
        ([13824, 5120], 0),
    ],
    "meta-llama/Llama-2-70b-hf": [
        ([8192, 10240], 1),
        ([8192, 8192], 0),
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
}
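The TP_SPLIT_DIM recorded with each [K, N] pair is the dimension that shrinks when a layer is sharded for tensor parallelism, and it is how the benchmark drivers derive per-rank GEMM shapes from these tables. A minimal sketch of that derivation (the helper name tp_shapes is illustrative, not part of this file):

import copy

def tp_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
    # Divide the TP-split dimension of every (K, N) pair by the TP degree,
    # e.g. ([14336, 4096], 0) becomes K=7168, N=4096 at TP2.
    shapes = []
    for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
        KN[tp_split_dim] //= tp_size
        shapes.append(tuple(KN))
    return shapes

# tp_shapes("meta-llama/Llama-2-70b-hf", 2)
# -> [(8192, 5120), (4096, 8192), (8192, 28672), (14336, 8192)]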
offline_benchmark_test/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
#!/bin/bash
# benchmark the overhead of disaggregated prefill.
# methodology:
# - send all request to prefill vLLM instance. It will buffer KV cache.
# - then send all request to decode instance.
# - The TTFT of decode instance is the overhead.
set -ex

kill_gpu_processes() {
  # kill all processes on GPU.
  pgrep pt_main_thread | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  local port=$1
  timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

benchmark() {
  export VLLM_LOGGING_LEVEL=DEBUG
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

  # compare chunked prefill with disaggregated prefill

  results_folder="./results"
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  dataset_name="sonnet"
  dataset_path="../sonnet_4x.txt"
  num_prompts=10
  qps=$1
  prefix_len=50
  input_len=2048
  output_len=$2

  CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200

  # let the prefill instance finish prefill
  python3 ../benchmark_serving.py \
    --backend vllm \
    --model $model \
    --dataset-name $dataset_name \
    --dataset-path $dataset_path \
    --sonnet-input-len $input_len \
    --sonnet-output-len "$output_len" \
    --sonnet-prefix-len $prefix_len \
    --num-prompts $num_prompts \
    --port 8100 \
    --save-result \
    --result-dir $results_folder \
    --result-filename disagg_prefill_tp1.json \
    --request-rate "inf"

  # send the request to decode.
  # The TTFT of this command will be the overhead of disagg prefill impl.
  python3 ../benchmark_serving.py \
    --backend vllm \
    --model $model \
    --dataset-name $dataset_name \
    --dataset-path $dataset_path \
    --sonnet-input-len $input_len \
    --sonnet-output-len "$output_len" \
    --sonnet-prefix-len $prefix_len \
    --num-prompts $num_prompts \
    --port 8200 \
    --save-result \
    --result-dir $results_folder \
    --result-filename disagg_prefill_tp1_overhead.json \
    --request-rate "$qps"

  kill_gpu_processes
}

main() {
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get -y install jq)
  (which socat) || (apt-get -y install socat)

  pip install quart httpx datasets

  cd "$(dirname "$0")"

  cd ..
  # create sonnet-4x.txt
  echo "" > sonnet_4x.txt
  for _ in {1..4}
  do
    cat sonnet.txt >> sonnet_4x.txt
  done
  cd disagg_benchmarks

  rm -rf results
  mkdir results

  default_qps=1
  default_output_len=1
  benchmark $default_qps $default_output_len
}

main "$@"
offline_benchmark_test/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
#!/bin/bash
# Requirement: 2x GPUs.
# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
# Resource: 2x GPU
# Approaches:
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# Prefilling instance: max_output_token=1
# Decoding instance: force the input tokens be the same across requests to bypass prefilling
set -ex

kill_gpu_processes() {
  # kill all processes on GPU.
  pgrep pt_main_thread | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
  sleep 1
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  local port=$1
  timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

launch_chunked_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
  CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.6 &
  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.6 &
  wait_for_server 8100
  wait_for_server 8200
  python3 round_robin_proxy.py &
  sleep 1
}

launch_disagg_prefill() {
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  # disagg prefill
  CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model $model \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200
  python3 disagg_prefill_proxy_server.py &
  sleep 1
}

benchmark() {
  results_folder="./results"
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  dataset_name="sonnet"
  dataset_path="../sonnet_4x.txt"
  num_prompts=100
  qps=$1
  prefix_len=50
  input_len=1024
  output_len=$2
  tag=$3

  python3 ../benchmark_serving.py \
    --backend vllm \
    --model $model \
    --dataset-name $dataset_name \
    --dataset-path $dataset_path \
    --sonnet-input-len $input_len \
    --sonnet-output-len "$output_len" \
    --sonnet-prefix-len $prefix_len \
    --num-prompts $num_prompts \
    --port 8000 \
    --save-result \
    --result-dir $results_folder \
    --result-filename "$tag"-qps-"$qps".json \
    --request-rate "$qps"

  sleep 2
}

main() {
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get -y install jq)
  (which socat) || (apt-get -y install socat)
  (which lsof) || (apt-get -y install lsof)

  pip install quart httpx matplotlib aiohttp datasets

  cd "$(dirname "$0")"

  cd ..
  # create sonnet-4x.txt so that we can sample 2048 tokens for input
  echo "" > sonnet_4x.txt
  for _ in {1..4}
  do
    cat sonnet.txt >> sonnet_4x.txt
  done
  cd disagg_benchmarks

  rm -rf results
  mkdir results

  default_output_len=6

  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

  launch_chunked_prefill
  for qps in 2 4 6 8; do
    benchmark $qps $default_output_len chunked_prefill
  done
  kill_gpu_processes

  launch_disagg_prefill
  for qps in 2 4 6 8; do
    benchmark $qps $default_output_len disagg_prefill
  done
  kill_gpu_processes

  python3 visualize_benchmark_results.py
}

main "$@"
offline_benchmark_test/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

import aiohttp
from quart import Quart, make_response, request

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

app = Quart(__name__)


async def forward_request(url, data):
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
        async with session.post(url=url, json=data, headers=headers) as response:
            if response.status == 200:
                # if response.headers.get('Transfer-Encoding') == 'chunked':
                if True:
                    async for chunk_bytes in response.content.iter_chunked(1024):
                        yield chunk_bytes
                else:
                    content = await response.read()
                    yield content


@app.route("/v1/completions", methods=["POST"])
async def handle_request():
    try:
        original_request_data = await request.get_json()

        prefill_request = original_request_data.copy()
        # change max_tokens = 1 to let it only do prefill
        prefill_request["max_tokens"] = 1

        # finish prefill
        async for _ in forward_request(
            "http://localhost:8100/v1/completions", prefill_request
        ):
            continue

        # return decode
        generator = forward_request(
            "http://localhost:8200/v1/completions", original_request_data
        )
        response = await make_response(generator)
        response.timeout = None

        return response

    except Exception as e:
        import sys
        import traceback

        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))


if __name__ == "__main__":
    app.run(port=8000)
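For reference, the proxy exposes the usual OpenAI-style completions route on port 8000: each request is first replayed against the prefill instance on 8100 with max_tokens forced to 1, then streamed back from the decode instance on 8200. A typical client call against the proxy (the model name and prompt are just example values) would be:

curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt": "San Francisco is a", "max_tokens": 16}'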
offline_benchmark_test/benchmarks/disagg_benchmarks/round_robin_proxy.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import itertools

import aiohttp
from aiohttp import web


class RoundRobinProxy:
    def __init__(self, target_ports):
        self.target_ports = target_ports
        self.port_cycle = itertools.cycle(self.target_ports)

    async def handle_request(self, request):
        target_port = next(self.port_cycle)
        target_url = f"http://localhost:{target_port}{request.path_qs}"

        async with aiohttp.ClientSession() as session:
            try:
                # Forward the request
                async with session.request(
                    method=request.method,
                    url=target_url,
                    headers=request.headers,
                    data=request.content,
                ) as response:
                    # Start sending the response
                    resp = web.StreamResponse(
                        status=response.status, headers=response.headers
                    )
                    await resp.prepare(request)

                    # Stream the response content
                    async for chunk in response.content.iter_any():
                        await resp.write(chunk)

                    await resp.write_eof()
                    return resp

            except Exception as e:
                return web.Response(text=f"Error: {str(e)}", status=500)


async def main():
    proxy = RoundRobinProxy([8100, 8200])
    app = web.Application()
    app.router.add_route("*", "/{path:.*}", proxy.handle_request)

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, "localhost", 8000)
    await site.start()

    print("Proxy server started on http://localhost:8000")

    # Keep the server running
    await asyncio.Event().wait()


if __name__ == "__main__":
    asyncio.run(main())
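Usage note: this proxy also listens on port 8000 but simply alternates whole requests between the backends on ports 8100 and 8200, which is how the chunked-prefill baseline in disagg_performance_benchmark.sh spreads load across two identical vLLM instances. It is started in the background there with:

python3 round_robin_proxy.py &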
offline_benchmark_test/benchmarks/disagg_benchmarks/visualize_benchmark_results.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

import matplotlib.pyplot as plt
import pandas as pd

if __name__ == "__main__":
    data = []
    for name in ["disagg_prefill", "chunked_prefill"]:
        for qps in [2, 4, 6, 8]:
            with open(f"results/{name}-qps-{qps}.json") as f:
                x = json.load(f)
                x["name"] = name
                x["qps"] = qps
                data.append(x)

    df = pd.DataFrame.from_dict(data)
    dis_df = df[df["name"] == "disagg_prefill"]
    chu_df = df[df["name"] == "chunked_prefill"]

    plt.style.use("bmh")
    plt.rcParams["font.size"] = 20

    for key in [
        "mean_ttft_ms",
        "median_ttft_ms",
        "p99_ttft_ms",
        "mean_itl_ms",
        "median_itl_ms",
        "p99_itl_ms",
    ]:
        fig, ax = plt.subplots(figsize=(11, 7))
        plt.plot(dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4)
        plt.plot(chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4)
        ax.legend()

        ax.set_xlabel("QPS")
        ax.set_ylabel(key)
        ax.set_ylim(bottom=0)
        fig.savefig(f"results/{key}.png")
        plt.close(fig)
offline_benchmark_test/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle as pkl
import time
from collections.abc import Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from tqdm import tqdm

import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm


@dataclass
class bench_params_t:
    num_tokens: int
    hidden_size: int
    add_residual: bool
    dtype: torch.dtype

    def description(self):
        return (
            f"N {self.num_tokens} "
            f"x D {self.hidden_size} "
            f"x R {self.add_residual} "
            f"x DT {self.dtype}"
        )


def get_bench_params() -> list[bench_params_t]:
    ## Test Fixtures
    NUM_TOKENS = [2**x for x in range(11)]
    HIDDEN_SIZES = list(range(1024, 8129, 1024))
    ADD_RESIDUAL = [True, False]
    DTYPES = [torch.bfloat16, torch.float]

    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
    bench_params = list(
        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
    )
    return bench_params


# Reference impls
def unfused_int8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
):
    # Norm
    torch_out = None
    if residual is None:
        torch_out = rms_norm_layer.forward_cuda(x, residual)
    else:
        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)

    # Quant
    torch_out, _, _ = ops.scaled_int8_quant(torch_out)


def unfused_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
):
    # Norm
    torch_out = None
    if residual is None:
        torch_out = rms_norm_layer.forward_cuda(x, residual)
    else:
        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)

    # Quant
    torch_out, _ = ops.scaled_fp8_quant(torch_out)


def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    quant_dtype: torch.dtype,
):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
        x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
    )


# Bench functions
def bench_fn(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: torch.Tensor,
    quant_dtype: torch.dtype,
    label: str,
    sub_label: str,
    fn: Callable,
    description: str,
) -> TMeasurement:
    min_run_time = 1

    globals = {
        "rms_norm_layer": rms_norm_layer,
        "x": x,
        "residual": residual,
        "quant_dtype": quant_dtype,
        "fn": fn,
    }
    return TBenchmark.Timer(
        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
        globals=globals,
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)


def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]:
    # Make inputs
    layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype)
    # Make weights
    layer.weight.data.normal_(mean=1.0, std=0.1)

    # Make inputs
    scale = 1 / params.hidden_size
    x = (
        torch.randn(params.num_tokens, params.hidden_size, dtype=params.dtype, device="cuda")
        * scale
    )
    residual = (
        (torch.randn_like(x) * scale).to(device="cuda") if params.add_residual else None
    )

    timers = []

    # unfused int8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.int8, label, sub_label,
                 unfused_int8_impl, "unfused_int8_impl")
    )

    # unfused fp8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
                 unfused_fp8_impl, "unfused_fp8_impl")
    )

    # fused int8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.int8, label, sub_label,
                 fused_impl, "fused_int8_impl")
    )

    # fused fp8 impl.
    timers.append(
        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
                 fused_impl, "fused_fp8_impl")
    )

    print_timers(timers)
    return timers


# launch bench
# runner
def print_timers(timers: Iterable[TMeasurement]):
    compare = TBenchmark.Compare(timers)
    compare.print()


def main():
    torch.set_default_device("cuda")
    bench_params = get_bench_params()

    timers = []
    for bp in tqdm(bench_params):
        timers.extend(bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
    print_timers(timers)

    # pickle all the results
    timestamp = int(time.time())
    with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f:
        pkl.dump(timers, f)


if __name__ == "__main__":
    main()
offline_benchmark_test/benchmarks/kernels/bench_fp8_gemm.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools

import torch
from weight_shapes import WEIGHT_SHAPES

from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from vllm.triton_utils import triton

PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "fp8-tensor-w-token-a": dict(w="tensor", a="token", no_a_quant=False, enabled=False),
    "fp8-tensor-w-tensor-a": dict(w="tensor", a="tensor", no_a_quant=False, enabled=True),
    "fp8-channel-w-token-a": dict(w="channel", a="token", no_a_quant=False, enabled=True),
    "fp8-channel-w-tensor-a": dict(w="channel", a="tensor", no_a_quant=False, enabled=False),
    "fp8-tensor-w-token-a-noquant": dict(w="tensor", a="token", no_a_quant=True, enabled=False),
    "fp8-tensor-w-tensor-a-noquant": dict(w="tensor", a="tensor", no_a_quant=True, enabled=True),
    "fp8-channel-w-token-a-noquant": dict(w="channel", a="token", no_a_quant=True, enabled=True),
    "fp8-channel-w-tensor-a-noquant": dict(w="channel", a="tensor", no_a_quant=True, enabled=False),
}

_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
    if w_type == "tensor":
        scale_b = torch.ones(1, device=device, dtype=torch.float32)
        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
    else:
        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
    return b_fp8.t(), scale_b_fp8


def build_fp8_runner(cfg, a, b, dtype, device):
    b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)

    scale_a_const = (
        torch.ones(1, device=device, dtype=torch.float32)
        if cfg["a"] == "tensor"
        else None
    )

    if cfg["no_a_quant"]:
        if cfg["a"] == "tensor":
            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
        else:
            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)

        def run():
            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

        return run

    if cfg["a"] == "tensor":

        def run():
            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

    else:

        def run():
            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

    return run


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs FP8 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

    if provider == "torch-bf16":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
    else:
        cfg = PROVIDER_CFGS[provider]
        run_quant = build_fp8_runner(cfg, a, b, dtype, device)
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), quantiles=quantiles
        )

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


def prepare_shapes(args):
    out = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_dim] //= tp_size
            KN.append(model)
            out.append(KN)
    return out


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()

    for K, N, model in prepare_shapes(args):
        print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
        benchmark.run(
            print_data=True,
            show_plots=True,
            save_path=f"bench_fp8_res_n{N}_k{K}",
            N=N,
            K=K,
        )

    print("Benchmark finished!")
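A typical sweep with this script, using only the flags defined above (the model and TP size shown are simply the argparse defaults), would be:

python3 ./benchmarks/kernels/bench_fp8_gemm.py --models meta-llama/Llama-3.1-8B-Instruct --tp-sizes 1

bench_int8_gemm.py below exposes the same --models/--tp-sizes interface, so the same invocation applies with the int8 script name.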
offline_benchmark_test/benchmarks/kernels/bench_int8_gemm.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools

import torch
from weight_shapes import WEIGHT_SHAPES

from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
from vllm.triton_utils import triton

PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "int8-tensor-w-token-a": dict(w="tensor", a="token", no_a_quant=False, enabled=False),
    "int8-tensor-w-tensor-a": dict(w="tensor", a="tensor", no_a_quant=False, enabled=True),
    "int8-channel-w-token-a": dict(w="channel", a="token", no_a_quant=False, enabled=True),
    "int8-channel-w-tensor-a": dict(w="channel", a="tensor", no_a_quant=False, enabled=False),
    "int8-tensor-w-token-a-noquant": dict(w="tensor", a="token", no_a_quant=True, enabled=False),
    "int8-tensor-w-tensor-a-noquant": dict(w="tensor", a="tensor", no_a_quant=True, enabled=True),
    "int8-channel-w-token-a-noquant": dict(w="channel", a="token", no_a_quant=True, enabled=True),
    "int8-channel-w-tensor-a-noquant": dict(w="channel", a="tensor", no_a_quant=True, enabled=False),
}


def _quant_weight(b, w_type, device):
    if w_type == "tensor":
        scale_b = torch.ones(1, device=device, dtype=torch.float32)
        b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
        assert scale_b_int8.numel() == 1
    else:  # channel
        b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
        assert scale_b_int8.numel() == b.shape[0]
    return b_int8.t(), scale_b_int8


def build_int8_runner(cfg, a, b, dtype, device):
    # quant before running the kernel
    b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)

    scale_a_const = None
    if cfg["a"] == "tensor":
        scale_a_const = torch.ones(1, device=device, dtype=torch.float32)

    # no quant, create activation ahead
    if cfg["no_a_quant"]:
        if cfg["a"] == "tensor":
            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
        else:  # token
            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)

        def run_quant():
            return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)

        return run_quant

    # dynamic quant, create activation inside
    if cfg["a"] == "tensor":

        def run_quant():
            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
            return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)

    else:  # token

        def run_quant():
            a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
            return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)

    return run_quant


_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=[k for k in _enabled],
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs INT8 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

    if provider == "torch-bf16":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
    else:
        cfg = PROVIDER_CFGS[provider]
        run_quant = build_int8_runner(cfg, a, b, dtype, device)
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), quantiles=quantiles
        )

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


def prepare_shapes(args):
    KN_model_names = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_dim] //= tp_size
            KN.append(model)
            KN_model_names.append(KN)
    return KN_model_names


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
        help="List of models to benchmark",
    )
    parser.add_argument(
        "--tp-sizes",
        nargs="+",
        type=int,
        default=[1],
        help="List of tensor parallel sizes",
    )
    args = parser.parse_args()

    for K, N, model in prepare_shapes(args):
        print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
        benchmark.run(
            print_data=True,
            show_plots=True,
            save_path=f"bench_int8_res_n{N}_k{K}",
            N=N,
            K=K,
        )

    print("Benchmark finished!")
offline_benchmark_test/benchmarks/kernels/benchmark_aqlm.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys
from typing import Optional

import torch
import torch.nn.functional as F

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
    dequantize_weight,
    generic_dequantize_gemm,
    get_int_dtype,
    optimized_dequantize_gemm,
)
from vllm.utils import FlexibleArgumentParser

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def torch_mult(
    # [..., in_features]
    input: torch.Tensor,
    weights: torch.Tensor,
    # [num_out_groups, 1, 1, 1]
    scales: torch.Tensor,
) -> torch.Tensor:
    output = F.linear(input, weights)
    return output


def dequant_out_scale(
    # [..., in_features]
    input: torch.Tensor,
    # [num_out_groups, num_in_groups, num_codebooks]
    codes: torch.IntTensor,
    # [num_codebooks, codebook_size, out_group_size, in_group_size]
    codebooks: torch.Tensor,
    # [num_out_groups, 1, 1, 1]
    scales: torch.Tensor,
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)

    if bias is None:
        output = F.linear(input, weights, bias)
        orig_shape = output.shape
        flattened_output = output.view(-1, output.size(-1))
        f_scales = scales.view(-1, scales.shape[0])
        b_scales = f_scales.expand(flattened_output.shape[0], -1)
        flattened_output *= b_scales
        return flattened_output.view(orig_shape)
    else:
        b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
        weights *= b_scales
        return F.linear(input, weights, bias)


def dequant_weight_scale(
    # [..., in_features]
    input: torch.Tensor,
    # [num_out_groups, num_in_groups, num_codebooks]
    codes: torch.IntTensor,
    # [num_codebooks, codebook_size, out_group_size, in_group_size]
    codebooks: torch.Tensor,
    # [num_out_groups, 1, 1, 1]
    scales: torch.Tensor,
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)

    b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
    weights *= b_scales
    return F.linear(input, weights, bias)


def dequant_no_scale(
    # [..., in_features]
    input: torch.Tensor,
    # [num_out_groups, num_in_groups, num_codebooks]
    codes: torch.IntTensor,
    # [num_codebooks, codebook_size, out_group_size, in_group_size]
    codebooks: torch.Tensor,
    # [num_out_groups, 1, 1, 1]
    scales: torch.Tensor,
    output_partition_sizes: torch.IntTensor,
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)

    return F.linear(input, weights, bias)


# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
    n = int(parts.sum().item())

    device = torch.device("cuda:0")

    code_range = (1 << bits) // 2
    ingroups = 8

    codes = torch.randint(
        -code_range,
        code_range,
        size=(n, k // ingroups, nbooks),
        dtype=get_int_dtype(bits),
        device=device,
    )

    codebooks = torch.randn(
        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
        dtype=torch.float16,
        device=device,
    )

    count = 0
    for index in range(16):
        for i in range(8):
            for book in range(nbooks):
                codebooks[book, index, 0, i] = count * (10**book)
                count += 1

    print("codes shape", codes.shape)

    for i in range(16):
        for book in range(nbooks):
            codes[0, i, book] = i
            codes[0, -i, book] = i

    weights = dequantize_weight(codes, codebooks, None)
    weights2 = ops.aqlm_dequant(codes, codebooks, parts)

    print("weights shape:", weights.shape)
    print("weights2 shape:", weights2.shape)

    print("weights are:", weights)
    print("weights2 are:", weights2)

    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))

    print("last 128 weights are", weights[0, -128:])
    print("last 128 weights2 are:", weights2[0, -128:])


def main():
    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")

    # Add arguments
    parser.add_argument(
        "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)"
    )
    parser.add_argument(
        "--bits",
        type=int,
        default=16,
        help="Number of bits per code element (default: 16)",
    )
    parser.add_argument(
        "--test",
        type=bool,
        default=False,
        help="Run the decompression/dequant tester rather than benchmarking "
        "(default: False)",
    )

    # Parse the arguments
    args = parser.parse_args()

    # Extract values
    nbooks = args.nbooks
    bits = args.bits

    if args.test:
        dequant_test(4096, torch.tensor((4096,)), nbooks, bits)
        return

    # Otherwise, benchmark.
    methods = [
        ops.aqlm_gemm,
        dequant_out_scale,
        generic_dequantize_gemm,
        optimized_dequantize_gemm,
        dequant_weight_scale,
        torch_mult,
        dequant_no_scale,
    ]

    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
    print(f"writing benchmarks to file {filename}")
    with open(filename, "w") as f:
        sys.stdout = f

        print("m | k | n | n parts", end="")
        for method in methods:
            print(f" | {method.__name__.replace('_', ' ')} (µs)", end="")
        print("")

        # These are reasonable prefill sizes.
        ksandpartions = (
            (4096, (4096, 4096, 4096)),
            (4096, (4096,)),
            (4096, (11008, 11008)),
            (11008, (4096,)),
        )

        # reasonable ranges for m.
        for m in [1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
                  128, 256, 512, 1024, 1536, 2048, 3072, 4096]:
            print(f"{m}", file=sys.__stdout__)
            for ksp in ksandpartions:
                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods)

        sys.stdout = sys.__stdout__


def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods):
    # I didn't see visible improvements from increasing these, but feel free :)
    num_warmup_trials = 1
    num_trials = 1

    num_calls = 100

    # warmup.
    for method in methods:
        for _ in range(num_warmup_trials):
            run_timing(
                num_calls=num_calls,
                m=m,
                k=k,
                parts=parts,
                nbooks=nbooks,
                bits=bits,
                method=method,
            )

    n = parts.sum().item()
    print(f"{m} | {k} | {n} | {parts.tolist()}", end="")

    for method in methods:
        best_time_us = 1e20
        for _ in range(num_trials):
            kernel_dur_ms = run_timing(
                num_calls=num_calls,
                m=m,
                k=k,
                parts=parts,
                nbooks=nbooks,
                bits=bits,
                method=method,
            )

            kernel_dur_us = 1000 * kernel_dur_ms

            if kernel_dur_us < best_time_us:
                best_time_us = kernel_dur_us

        print(f" | {kernel_dur_us:.0f}", end="")

    print("")


def run_timing(
    num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method
) -> float:
    n = int(parts.sum().item())

    device = torch.device("cuda:0")

    input = torch.randn((1, m, k), dtype=torch.float16, device=device)

    code_range = (1 << bits) // 2
    ingroups = 8

    codes = torch.randint(
        -code_range,
        code_range,
        size=(n, k // ingroups, nbooks),
        dtype=get_int_dtype(bits),
        device=device,
    )

    codebooks = torch.randn(
        size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
        dtype=torch.float16,
        device=device,
    )

    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)

    # for comparison to just a pytorch mult.
    weights = torch.randn((n, k), dtype=torch.float16, device=device)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()

    if method is torch_mult:
        for i in range(num_calls):
            torch_mult(input, weights, scales)
    else:
        for i in range(num_calls):
            method(input, codes, codebooks, scales, parts, None)

    end_event.record()
    end_event.synchronize()

    dur_ms = start_event.elapsed_time(end_event) / num_calls
    return dur_ms


if __name__ == "__main__":
    sys.exit(main())
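With the arguments defined in main(), a benchmark run and the visual dequant comparison can be launched as, for example:

python3 ./benchmarks/kernels/benchmark_aqlm.py --nbooks 1 --bits 16
python3 ./benchmarks/kernels/benchmark_aqlm.py --nbooks 2 --bits 8 --test True

(the 1x16 and 2x8 combinations match the optimized CUDA dequant kernels mentioned in the comment above dequant_test).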
offline_benchmark_test/benchmarks/kernels/benchmark_bitblas.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    MINIMUM_BITBLAS_VERSION,
)

try:
    import bitblas

    if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
        raise ImportError(
            "bitblas version is wrong. Please "
            f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
        )
except ImportError as e:
    bitblas_import_exception = e
    raise ValueError(
        "Trying to use the bitblas backend, but could not import"
        f"with the following error: {bitblas_import_exception}. "
        "Please install bitblas through the following command: "
        f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
    ) from bitblas_import_exception

from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target."
)

# Add arguments to the parser
parser.add_argument(
    "--target",
    type=str,
    default=auto_detect_nvidia_target(),
    help="Specify the target device for benchmarking.",
)
parser.add_argument(
    "--group_size", type=int, default=None, help="Group size for grouped quantization."
)
parser.add_argument(
    "--A_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "float64", "int32", "int8"],
    help="Data type of activation A.",
)
parser.add_argument(
    "--W_dtype",
    type=str,
    default="int4",
    choices=["float16", "float32", "float64", "int32", "int8", "int4", "int2", "int1", "nf4", "fp4_e2m1"],
    help="Data type of weight W.",
)
parser.add_argument(
    "--accum_dtype",
    type=str,
    default="float16",
    choices=["float16", "int32"],
    help="Data type for accumulation.",
)
parser.add_argument(
    "--out_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "int32", "int8"],
    help="Data type for output.",
)
parser.add_argument(
    "--layout",
    type=str,
    default="nt",
    choices=["nt", "nn"],
    help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument(
    "--with_bias", action="store_true", help="Include bias in the benchmark."
)
parser.add_argument(
    "--with_scaling",
    action="store_true",
    help="Include scaling factor in the quantization.",
)
parser.add_argument(
    "--with_zeros", action="store_true", help="Include zeros in the quantization."
)
parser.add_argument(
    "--zeros_mode",
    type=str,
    default=None,
    choices=["original", "rescale", "quantized"],
    help="Specify the mode for calculating zeros.",
)

# Parse the arguments
args = parser.parse_args()

# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode

# Define a list of shared arguments that repeat in every config
shared_args = [
    A_dtype,
    W_dtype,
    out_dtype,
    accum_dtype,
    layout,
    with_bias,
    group_size,
    with_scaling,
    with_zeros,
    zeros_mode,
]

# Define just the (M, K, N) shapes in a more compact list
shapes = [
    # square test
    (1, 16384, 16384),
    # BLOOM-176B
    (1, 43008, 14336),
    (1, 14336, 14336),
    (1, 57344, 14336),
    (1, 14336, 57344),
    # OPT-65B
    (1, 9216, 9216),
    (1, 36864, 9216),
    (1, 9216, 36864),
    (1, 22016, 8192),
    # LLAMA-70B/65B
    (1, 8192, 22016),
    (1, 8192, 8192),
    (1, 28672, 8192),
    (1, 8192, 28672),
    # square test
    (16384, 16384, 16384),
    # BLOOM-176B
    (8192, 43008, 14336),
    (8192, 14336, 14336),
    (8192, 57344, 14336),
    (8192, 14336, 57344),
    # OPT-65B
    (8192, 9216, 9216),
    (8192, 36864, 9216),
    (8192, 9216, 36864),
    (8192, 22016, 8192),
    # LLAMA-70B/65B
    (8192, 8192, 22016),
    (8192, 8192, 8192),
    (8192, 28672, 8192),
    (8192, 8192, 28672),
]

# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]

benchmark_sets = []
benchmark_sets.extend(test_shapes)

benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
    config = config_class(*input_args)
    matmul = operator(config, target=target, enable_tuning=True)
    kernel_latency = matmul.profile_latency()

    print("Time cost is: {:.3f} ms".format(kernel_latency))

    profile_config = {
        f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
            "BitBLAS_top20_latency": kernel_latency,
        }
    }

    benchmark_results.update(profile_config)

# Define headers for the table
headers = [
    "PrimFunc",
    "Input Arguments",
    "BitBLAS Top20 Latency",
]

# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
    args_split = config_key.split("-")
    func_name = args_split[0]
    input_args_str = "-".join(args_split[1:])
    col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
    col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
    col_widths[2] = max(
        col_widths[2],
        len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
        len(headers[2]) + 2,
    )
    # break only if you want to measure widths from a single example;
    # otherwise, let it loop over all items.

# Print header
for i, header in enumerate(headers):
    headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))

# Print rows
for config_key, values in benchmark_results.items():
    args_split = config_key.split("-")
    func_name = args_split[0]
    input_args_str = "-".join(args_split[1:])
    row = [
        func_name,
        input_args_str,
        f"{values['BitBLAS_top20_latency']:.3f} ms",
    ]
    row_str = "".join(
        [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
    )
    print(row_str)
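With the defaults above the script auto-detects the NVIDIA target and times float16 activations against int4 weights for every shape in the list; grouped, scaled quantization can be requested explicitly, e.g. (the group size value here is only an example):

python3 ./benchmarks/kernels/benchmark_bitblas.py --A_dtype float16 --W_dtype int4 --group_size 128 --with_scaling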
offline_benchmark_test/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
activations. The triton_moe kernel takes in fp8 weights(tensor scaled to fp8)
and 16-bit activations.
"""
import nvtx
import torch
import torch.utils.benchmark as benchmark

from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
from vllm.utils import FlexibleArgumentParser

WEIGHT_SHAPES_MOE = {
    "nvidia/DeepSeek-R1-FP4": [
        [256, 8, 2048, 7168],
    ],
}

DEFAULT_MODELS = [
    "nvidia/DeepSeek-R1-FP4",
]

DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]

PER_ACT_TOKEN_OPTS = [False]
PER_OUT_CH_OPTS = [False]
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max


def to_fp8(tensor: torch.Tensor):
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
        dtype=torch.float8_e4m3fn
    )


def bench_run(
    results: list[benchmark.Measurement],
    model: str,
    num_experts: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
    mkn: tuple[int, int, int],
):
    label = "NVFP4 Blockscaled CUTLASS MOE vs FP8 Tensor Scaled Triton"

    sub_label = (
        "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format(
            model, num_experts, topk, per_act_token, per_out_ch, mkn
        )
    )

    print(f"Testing: {sub_label}")

    (m, k, n) = mkn

    dtype = torch.half
    device = "cuda"
    a = torch.randn((m, k), device=device, dtype=dtype) / 10
    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10

    _, a_fp8_scale = ops.scaled_fp8_quant(a)

    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=torch.float8_e4m3fn)
    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=torch.float8_e4m3fn)
    w1_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
    w2_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)

    for expert in range(num_experts):
        w1_fp8q[expert], w1_fp8scale[expert] = ops.scaled_fp8_quant(w1[expert])
        w2_fp8q[expert], w2_fp8scale[expert] = ops.scaled_fp8_quant(w2[expert])

    w1_fp8q_notransp = w1_fp8q.clone()
    w2_fp8q_notransp = w2_fp8q.clone()
    w1_fp8q = w1_fp8q.transpose(1, 2)
    w2_fp8q = w2_fp8q.transpose(1, 2)

    score = torch.randn((m, num_experts), device=device, dtype=dtype)

    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)

    quant_blocksize = 16
    w1_blockscale = torch.empty(
        (num_experts, 2 * n, k // quant_blocksize),
        device=device,
        dtype=torch.float8_e4m3fn,
    )
    w2_blockscale = torch.empty(
        (num_experts, k, n // quant_blocksize), device=device, dtype=torch.float8_e4m3fn
    )

    # n_b_scales = 2 * n if per_out_ch else 1
    # k_b_scales = k if per_out_ch else 1
    w1_fp4 = torch.empty((num_experts, 2 * n, k // 2), device=device, dtype=torch.uint8)
    w2_fp4 = torch.empty((num_experts, k, n // 2), device=device, dtype=torch.uint8)

    w1_gs = torch.empty((num_experts,), device=device, dtype=torch.float32)
    w2_gs = torch.empty((num_experts,), device=device, dtype=torch.float32)
    a1_gs = torch.ones((num_experts,), device=device, dtype=torch.float32)
    a2_gs = torch.ones((num_experts,), device=device, dtype=torch.float32)

    for expert in range(num_experts):
        w1_e = w1[expert]
        w2_e = w2[expert]
        w1_amax = torch.abs(w1_e).max().to(torch.float32)
        w2_amax = torch.abs(w2_e).max().to(torch.float32)
        w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax
        w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax

        w1_fp4[expert], w1_blockscale[expert] = ops.scaled_fp4_quant(w1_e, w1_gs[expert])
        w2_fp4[expert], w2_blockscale[expert] = ops.scaled_fp4_quant(w2_e, w2_gs[expert])

    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a_fp8_scale: torch.Tensor,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
            fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                use_fp8_w8a8=True,
                w1_scale=w1_scale,
                w2_scale=w2_scale,
                a1_scale=a_fp8_scale,
            )

    def run_cutlass_moe_fp4(
        a: torch.Tensor,
        w1_fp4: torch.Tensor,
        w2_fp4: torch.Tensor,
        w1_blockscale: torch.Tensor,
        w2_blockscale: torch.Tensor,
        w1_gs: torch.Tensor,
        w2_gs: torch.Tensor,
        a1_gs: torch.Tensor,
        a2_gs: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        m: int,
        n: int,
        k: int,
        e: int,
        device: torch.device,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
            with nvtx.annotate("cutlass_moe_fp4", color="green"):
                cutlass_moe_fp4(
                    a=a,
                    a1_gscale=a1_gs,
                    a2_gscale=a2_gs,
                    w1_fp4=w1_fp4,
                    w1_blockscale=w1_blockscale,
                    w1_alphas=w1_gs,
                    w2_fp4=w2_fp4,
                    w2_blockscale=w2_blockscale,
                    w2_alphas=w2_gs,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    m=m,
                    n=n,
                    k=k,
                    e=num_experts,
                    device=device,
                )

    def run_cutlass_from_graph(
        a: torch.Tensor,
        a1_gscale: torch.Tensor,
        w1_fp4: torch.Tensor,
        w1_blockscale: torch.Tensor,
        w1_alphas: torch.Tensor,
        a2_gscale: torch.Tensor,
        w2_fp4: torch.Tensor,
        w2_blockscale: torch.Tensor,
        w2_alphas: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        m: int,
        n: int,
        k: int,
        e: int,
        device: torch.device,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return cutlass_moe_fp4(
                a=a,
                a1_gscale=a1_gs,
                w1_fp4=w1_fp4,
                w1_blockscale=w1_blockscale,
                w1_alphas=w1_alphas,
                a2_gscale=a2_gs,
                w2_fp4=w2_fp4,
                w2_blockscale=w2_blockscale,
                w2_alphas=w2_alphas,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                m=m,
                n=n,
                k=k,
                e=num_experts,
                device=device,
            )

    def run_triton_from_graph(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a_fp8_scale: torch.Tensor,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                use_fp8_w8a8=True,
                w1_scale=w1_scale,
                w2_scale=w2_scale,
                a1_scale=a_fp8_scale,
            )

    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
        torch.cuda.synchronize()

    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        run_cutlass_from_graph(
            a=a,
            a1_gscale=a1_gs,
            w1_fp4=w1_fp4,
            w1_blockscale=w1_blockscale,
            w1_alphas=w1_gs,
            a2_gscale=a2_gs,
            w2_fp4=w2_fp4,
            w2_blockscale=w2_blockscale,
            w2_alphas=w2_gs,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            m=m,
            n=n,
            k=k,
            e=num_experts,
            device=device,
        )
    torch.cuda.synchronize()

    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        run_triton_from_graph(
            a,
            w1_fp8q_notransp,
            w2_fp8q_notransp,
            topk_weights,
            topk_ids,
            w1_fp8scale,
            w2_fp8scale,
            a_fp8_scale,
        )
    torch.cuda.synchronize()

    min_run_time = 5
    num_warmup = 5
    num_runs = 25

    globals = {
        # Baseline params
        "w1": w1,
        "w2": w2,
        "score": score,
        "topk": topk,
        "w1_fp8q_notransp": w1_fp8q_notransp,
        "w2_fp8q_notransp": w2_fp8q_notransp,
        "w1_fp8scale": w1_fp8scale,
        "w2_fp8scale": w2_fp8scale,
        "a_fp8_scale": a_fp8_scale,
        # Cutlass params
        "a": a,
        "a1_gscale": a1_gs,
        "w1_fp4": w1_fp4,
        "w1_blockscale": w1_blockscale,
        "w1_alphas": w1_gs,
        "a2_gscale": a2_gs,
        "w2_fp4": w2_fp4,
        "w2_blockscale": w2_blockscale,
        "w2_alphas": w2_gs,
        "topk_weights": topk_weights,
        "topk_ids": topk_ids,
        "m": m,
        "n": n,
        "k": k,
        "e": num_experts,
        "device": device,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
        # Gen params
        "num_runs": num_runs,
        # Kernels
        "run_triton_moe": run_triton_moe,
        "run_cutlass_moe_fp4": run_cutlass_moe_fp4,
        "replay_graph": replay_graph,
    }

    # Warmup
    run_triton_moe(
        a,
        w1_fp8q_notransp,
        w2_fp8q_notransp,
        topk_weights,
        topk_ids,
        w1_fp8scale,
        w2_fp8scale,
        a_fp8_scale,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
            stmt="run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    replay_graph(triton_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(triton_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    run_cutlass_moe_fp4(
        a,
        w1_fp4,
        w2_fp4,
        w1_blockscale,
        w2_blockscale,
        w1_gs,
        w2_gs,
        a1_gs,
        a2_gs,
        topk_weights,
        topk_ids,
        m,
        n,
        k,
        num_experts,
        device,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
            stmt="run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_alphas, w2_alphas, a1_gscale, a2_gscale, topk_weights, topk_ids, m, n, k, e, device, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="cutlass_moe_fp4",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    replay_graph(cutlass_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(cutlass_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="cutlass_moe_fp4_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time)
    )


def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

    results: list[benchmark.Measurement] = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in PER_ACT_TOKEN_OPTS:
                    for per_out_ch in PER_OUT_CH_OPTS:
                        for size_m in args.batch_sizes:
                            mkn = (size_m, size_k, size_n)
                            bench_run(
                                results,
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                mkn,
                            )

    compare = benchmark.Compare(results)
    compare.print()


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark NVFP4 CUTLASS MOE across specified models/shapes/batches"
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument("--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES)
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])

    args = parser.parse_args()
    main(args)
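Using the argparse options above, a run restricted to a few batch sizes of the default DeepSeek-R1-FP4 shape would look like:

python3 ./benchmarks/kernels/benchmark_cutlass_fp4_moe.py --batch-sizes 4 64 1024 --tp-sizes 1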
offline_benchmark_test/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE

from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    fused_topk,
)
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
    "nm-testing/deepseekv2-lite",
    "ibm-granite/granite-3.0-1b-a400m",
    "ibm-granite/granite-3.0-3b-a800m",
]
DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]

PER_ACT_TOKEN_OPTS = [False]
PER_OUT_CH_OPTS = [False]


def to_fp8(tensor: torch.Tensor):
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
        dtype=torch.float8_e4m3fn
    )


def bench_run(
    results: list[benchmark.Measurement],
    model: str,
    num_experts: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
    mkn: tuple[int, int, int],
):
    label = "Quant Matmul"

    sub_label = (
        "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format(
            model, num_experts, topk, per_act_token, per_out_ch, mkn
        )
    )

    print(f"Testing: {sub_label}")

    (m, k, n) = mkn

    dtype = torch.half

    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10

    _, a_scale = ops.scaled_fp8_quant(a)

    w1_q = torch.empty(
        (num_experts, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn
    )
    w2_q = torch.empty((num_experts, k, n), device="cuda", dtype=torch.float8_e4m3fn)
    w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
    w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)

    for expert in range(num_experts):
        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])

    score = torch.randn((m, num_experts), device="cuda", dtype=dtype)

    topk_weights, topk_ids, token_expert_indices = fused_topk(
        a, score, topk, renormalize=False
    )

    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a_scale: torch.Tensor,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
            fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                use_fp8_w8a8=True,
                w1_scale=w1_scale,
                w2_scale=w2_scale,
                a1_scale=a_scale,
            )

    def run_cutlass_moe(
        a: torch.Tensor,
        a_scale: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        per_act_token: bool,
        num_repeats: int,
    ):
        for _ in range(num_repeats):
            cutlass_moe_fp8(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                w1_scale,
                w2_scale,
                per_act_token,
                a1_scale=None,
            )

    def run_cutlass_from_graph(
        a: torch.Tensor,
        a_scale: torch.Tensor,
        w1_q: torch.Tensor,
        w2_q: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return cutlass_moe_fp8(
                a,
                w1_q,
                w2_q,
                topk_weights,
                topk_ids,
                w1_scale,
                w2_scale,
                per_act_token,
                a1_scale=None,
            )

    def run_triton_from_graph(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a_scale: torch.Tensor,
    ):
        with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
        ):
            return fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                use_fp8_w8a8=True,
                w1_scale=w1_scale,
                w2_scale=w2_scale,
                a1_scale=a_scale,
            )

    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
        torch.cuda.synchronize()

    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        run_cutlass_from_graph(
            a,
            a_scale,
            w1_q,
            w2_q,
            w1_scale,
            w2_scale,
            topk_weights,
            topk_ids,
        )
    torch.cuda.synchronize()

    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        run_triton_from_graph(
            a,
            w1_q,
            w2_q,
            topk_weights,
            topk_ids,
            w1_scale,
            w2_scale,
            a_scale,
        )
    torch.cuda.synchronize()

    min_run_time = 5
    num_warmup = 5
    num_runs = 25

    globals = {
        # Baseline params
        "w1": w1,
        "w2": w2,
        "score": score,
        "topk": topk,
        # Cutlass params
        "a_scale": a_scale,
        "w1_q": w1_q,
        "w2_q": w2_q,
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "per_act_token": per_act_token,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
        # Gen params
        "a": a,
        "topk_weights": topk_weights,
        "topk_ids": topk_ids,
        "num_runs": num_runs,
        # Kernels
        "run_triton_moe": run_triton_moe,
        "run_cutlass_moe": run_cutlass_moe,
        "replay_graph": replay_graph,
    }

    # Warmup
    run_triton_moe(
        a,
        w1_q,
        w2_q,
        topk_weights,
        topk_ids,
        w1_scale,
        w2_scale,
        a_scale,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
            stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    replay_graph(triton_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(triton_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    run_cutlass_moe(
        a,
        a_scale,
        w1_q,
        w2_q,
        w1_scale,
        w2_scale,
        topk_weights,
        topk_ids,
        per_act_token,
        num_warmup,
    )

    results.append(
        benchmark.Timer(
            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    # Warmup
    replay_graph(cutlass_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(cutlass_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time)
    )


def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

    results: list[benchmark.Measurement] = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in PER_ACT_TOKEN_OPTS:
                    for per_out_ch in PER_OUT_CH_OPTS:
                        for size_m in DEFAULT_BATCH_SIZES:
                            mkn = (size_m, size_k, size_n)
                            bench_run(
                                results,
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                mkn,
                            )

    compare = benchmark.Compare(results)
    compare.print()


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark Marlin across specified models/shapes/batches"
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument("--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES)
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])

    args = parser.parse_args()
    main(args)
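The benchmark above times each MoE kernel twice: once launched directly from Python and once replayed from a captured CUDA Graph (see replay_graph and the torch.cuda.graph blocks). Below is a minimal sketch of that capture-and-replay pattern, assuming a CUDA device is available and using a plain matmul as a stand-in for the MoE kernels; it is illustrative only and not part of the commit.

# Minimal sketch of the CUDA Graph capture/replay pattern used in bench_run() above.
# Assumes a CUDA device; the matmul stands in for the fused/cutlass MoE kernels.
import torch

a = torch.randn(256, 256, device="cuda", dtype=torch.half)
b = torch.randn(256, 256, device="cuda", dtype=torch.half)

stream = torch.cuda.Stream()
graph = torch.cuda.CUDAGraph()

# Warm up on a side stream before capture, as recommended by the PyTorch docs.
stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(stream):
    for _ in range(3):
        a @ b
torch.cuda.current_stream().wait_stream(stream)

with torch.cuda.graph(graph, stream=stream):
    c = a @ b  # captured work; inputs must stay at the same memory addresses

torch.cuda.synchronize()
for _ in range(25):  # mirrors replay_graph(graph, num_runs)
    graph.replay()
torch.cuda.synchronize()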
offline_benchmark_test/benchmarks/kernels/benchmark_layernorm.py
0 → 100644
View file @ d1a06223
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time

import torch

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


@torch.inference_mode()
def main(
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,
    dtype: torch.dtype,
    seed: int = 0,
    do_profile: bool = False,
    num_warmup_iters: int = 5,
    num_iters: int = 100,
) -> None:
    current_platform.seed_everything(seed)
    torch.set_default_device("cuda")

    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)
    scale = 1 / (2 * hidden_size)
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    x *= scale
    residual = torch.randn_like(x) * scale if add_residual else None

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
        torch.cuda.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        for _ in range(num_iters):
            layer(x, residual)
        torch.cuda.synchronize()

        end_time = time.perf_counter()
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters

    # Warmup.
    print("Warming up...")
    run_benchmark = run_cuda_benchmark
    run_benchmark(num_iters=num_warmup_iters, profile=False)

    # Benchmark.
    if do_profile:
        latency = run_benchmark(num_iters=1, profile=True)
    else:
        latency = run_benchmark(num_iters=num_iters, profile=False)
    print(f"Kernel running time: {latency * 1000000:.3f} us")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the layernorm kernel.")
    parser.add_argument("--num-tokens", type=int, default=4096)
    parser.add_argument("--hidden-size", type=int, default=8192)
    parser.add_argument("--add-residual", action="store_true")
    parser.add_argument(
        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--num-warmup-iters", type=int, default=5)
    parser.add_argument(
        "--num-iters",
        type=int,
        default=100,
        help="Number of benchmark iterations. "
        "If --profile is set, this number is ignored",
    )

    args = parser.parse_args()
    print(args)

    main(
        num_tokens=args.num_tokens,
        hidden_size=args.hidden_size,
        add_residual=args.add_residual,
        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
        seed=args.seed,
        do_profile=args.profile,
        num_warmup_iters=args.num_warmup_iters,
        num_iters=args.num_iters,
    )
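The script above times vLLM's fused RMSNorm layer. For reference, a plain PyTorch sketch of the RMSNorm computation is shown below; the eps value is an assumption (vLLM's layers typically default to 1e-6), and the in-place residual-add path exercised by --add-residual is omitted.

# Reference sketch of the RMSNorm computation benchmarked above (illustrative only).
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize each row by its root-mean-square, then apply the learned scale.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight

x = torch.randn(4, 8, dtype=torch.float32)
w = torch.ones(8)
print(rms_norm_ref(x, w).shape)  # torch.Size([4, 8])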
offline_benchmark_test/benchmarks/kernels/benchmark_lora.py
0 → 100644
View file @ d1a06223
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import json
import pickle
import time
from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
from typing import Any, Callable, Optional

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES

from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_TP_SIZES = [1]
DEFAULT_BATCH_SIZES = [
    1,
    16,
    32,
    64,
    128,
    192,
    256,
    320,
    384,
    448,
    512,
    640,
    768,
    896,
    1024,
    2048,
    3072,
    4096,
    5120,
    6144,
    7168,
    8192,
]
DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384]
DEFAULT_LORA_RANKS = [16]
DEFAULT_NUM_LORAS = [1, 2, 3, 4]
DEFAULT_SORT_BY_LORA_IDS = [False, True]
DEFAULT_SEQ_LENGTHS = [1]
DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]


# Utilities
def dtype_to_str(dtype: torch.dtype):
    if dtype == torch.float16:
        return "f16"
    if dtype == torch.bfloat16:
        return "bf16"
    if dtype == torch.float32:
        return "f32"
    raise ValueError(f"Unsupported dtype {dtype}")


def make_rand_lora_weight_tensor(
    k: int, n: int, num_loras: int, dtype: torch.dtype, device: str = "cuda"
) -> torch.Tensor:
    # LoRA weights column major
    return torch.rand((num_loras, n, k), dtype=dtype).to(device)


def make_rand_tensors(
    a_shape: tuple[int],
    b_shape: tuple[int],
    c_shape: tuple[int],
    a_dtype: torch.dtype,
    b_dtype: torch.dtype,
    c_dtype: torch.dtype,
    num_slices: int,
    device: str = "cuda",
) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]:
    """
    Make LoRA input/output matrices.
    """
    A = torch.rand(a_shape, dtype=a_dtype).to(device)

    # LoRA weights column major
    Bs = [torch.rand(b_shape, dtype=b_dtype).to(device) for _ in range(num_slices)]

    C = torch.zeros(c_shape, dtype=c_dtype).to(device)
    return A, Bs, C


def make_prompt_lora_mapping(
    num_prompts: int, num_active_loras: int, sort_by_lora_id: bool, device: str
) -> torch.Tensor:
    """
    All prompts are mapped to a LoRA ID in range [0, num_active_loras).
    where 0 refers to first lora, 1 refers to second lora and so on.
    """
    assert num_active_loras > 0

    if not sort_by_lora_id:
        return torch.randint(0, num_active_loras, (num_prompts,), dtype=torch.long)

    # Divide LoRAs equally and in order.
    part_size = num_prompts // num_active_loras
    part_size = max(part_size, 1)

    lora_id = 0
    prompt_lora_mapping = []
    while len(prompt_lora_mapping) < num_prompts:
        prompt_lora_mapping.extend([lora_id] * part_size)
        lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id
    return torch.tensor(
        prompt_lora_mapping[:num_prompts], dtype=torch.long, device=device
    )


def make_token_lora_mapping(
    num_tokens: int,
    num_prompts: int,
    prompt_lora_mapping: torch.Tensor,
    seq_len_tensor: torch.Tensor,
    device: str,
):
    """
    Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor
    """
    assert prompt_lora_mapping.shape[0] == num_prompts

    # token to lora index mapping
    token_lora_mapping = [0] * num_tokens
    current_offset = 0
    for b_id in range(num_prompts):
        lora_index = prompt_lora_mapping[b_id].item()
        s = current_offset
        e = s + seq_len_tensor[b_id].item()
        token_lora_mapping[s:e] = [lora_index] * (e - s)
        current_offset += seq_len_tensor[b_id].item()

    return torch.tensor(token_lora_mapping, dtype=torch.long, device=device)


def ref_group_gemm(
    ref_out: torch.Tensor,
    input: torch.Tensor,
    lora_weights: list[torch.Tensor],
    seq_lens_cpu: torch.Tensor,
    prompt_lora_mapping_cpu: torch.Tensor,
    scaling: float,
    add_inputs: Optional[bool],
):
    """
    Torch group gemm reference implementation to test correctness of
    benchmarking operations.
    """
    batches = seq_lens_cpu.size(0)
    out_list = []
    current_offset = 0
    for lora_index, b_length in zip(range(batches), seq_lens_cpu):
        x = input[current_offset : b_length + current_offset, :]
        current_offset += b_length
        w = lora_weights[prompt_lora_mapping_cpu[lora_index]]
        result = torch.nn.functional.linear(x, w)
        result *= scaling
        out_list.append(result)

    cat_result = torch.cat(out_list, dim=0)

    if add_inputs:
        ref_out += cat_result
    else:
        ref_out.copy_(cat_result)


class OpType(Enum):
    """
    LoRA Ops to benchmark and its properties.
    """

    LORA_SHRINK = auto()
    LORA_EXPAND = auto()

    @staticmethod
    def from_str(s: str) -> "OpType":
        if s.lower() == "lora_shrink":
            return OpType.LORA_SHRINK
        if s.lower() == "lora_expand":
            return OpType.LORA_EXPAND
        raise ValueError(f"Unrecognized str {s} to convert to OpType")

    def is_shrink_fn(self) -> bool:
        return self in [OpType.LORA_SHRINK]

    def is_expand_fn(self) -> bool:
        return self in [OpType.LORA_EXPAND]

    def num_slices(self) -> list[int]:
        return [1, 2, 3]

    def mkn(
        self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
    ) -> tuple[int, int, int]:
        num_tokens = batch_size * seq_length
        if self.is_shrink_fn():
            m = num_tokens
            k = hidden_size
            n = lora_rank
        else:
            assert self.is_expand_fn()
            m = num_tokens
            k = lora_rank
            n = hidden_size
        return m, k, n

    def matmul_dtypes(
        self, op_dtype: torch.dtype
    ) -> tuple[torch.dtype, torch.dtype, torch.dtype]:
        """
        return a type, b type and c type for A x B = C
        """
        if self.is_shrink_fn():
            return op_dtype, op_dtype, torch.float32
        else:
            assert self.is_expand_fn()
            return torch.float32, op_dtype, op_dtype

    def matmul_shapes(
        self,
        batch_size: int,
        seq_length: int,
        hidden_size: int,
        lora_rank: int,
        num_loras: int,
        num_slices: int,
    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
        in A x B = C, for the op_type
        """
        m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)

        b_shape = (num_loras, n, k)  # col-major
        if self in [OpType.LORA_SHRINK]:
            # LoRA shrink kernels support num_slices inherently in the kernel.
            return ((m, k), b_shape, (num_slices, m, n))
        if self in [OpType.LORA_EXPAND]:
            # LoRA expand kernels support num_slices inherently in the kernel
            return ((num_slices, m, k), b_shape, (m, n * num_slices))
        raise ValueError(f"Unrecognized op_type {self}")

    def bench_fn(self) -> Callable:
        if self == OpType.LORA_SHRINK:
            return lora_shrink
        if self == OpType.LORA_EXPAND:
            return lora_expand
        raise ValueError(f"Unrecognized optype {self}")

    def run_ref_group_gemm(
        self,
        output: torch.Tensor,
        input: torch.Tensor,
        lora_weights: list[torch.Tensor],
        **kwargs,
    ) -> Callable:
        """Each benchmark operation expects the input, lora_weights and outputs
        in a slightly different format. Refer to self.matmul_shapes().
        run_ref_group_gemm accounts for those differences in executing a
        reference group gemm for correctness testing.
        """
        w_dtype = lora_weights[0].dtype
        num_slices = len(lora_weights)
        if self in [OpType.LORA_SHRINK]:
            for slice_idx in range(num_slices):
                ref_group_gemm(
                    ref_out=output[slice_idx, :],
                    input=input,
                    lora_weights=lora_weights[slice_idx],
                    **kwargs,
                )
        elif self in [OpType.LORA_EXPAND]:
            hidden_size = lora_weights[0].shape[1]
            for slice_idx in range(num_slices):
                slice_offset = slice_idx * hidden_size
                ref_group_gemm(
                    ref_out=output[:, slice_offset : slice_offset + hidden_size],
                    input=input[slice_idx].clone().to(dtype=w_dtype),
                    lora_weights=lora_weights[slice_idx],
                    **kwargs,
                )
        else:
            raise ValueError(f"Unrecognized optype {self}")


@dataclass
class BenchmarkContext:
    """
    LoRA benchmark context
    """

    batch_size: int
    hidden_size: int
    num_loras: int
    num_active_loras: int
    lora_rank: int
    sort_by_lora_id: bool
    dtype: torch.dtype
    seq_length: Optional[int] = None
    num_slices: Optional[int] = None  # num_slices for slice based ops

    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
        ctx = copy.copy(self)
        ctx.seq_length = seq_length
        return ctx

    def with_num_slices(self, num_slices: int) -> "BenchmarkContext":
        ctx = copy.copy(self)
        ctx.num_slices = num_slices
        return ctx

    def bench_label(self) -> str:
        return f"lora-{self.dtype}"

    def bench_sublabel(self, op_type: OpType) -> str:
        m, k, n = op_type.mkn(
            self.batch_size, self.seq_length, self.hidden_size, self.lora_rank
        )
        desc = {
            "bs": self.batch_size,
            "sl": self.seq_length,
            "m": m,
            "k": k,
            "n": n,
            "num_loras": self.num_loras,
            "sort_by_lora": self.sort_by_lora_id,
            "num_slices": self.num_slices,
        }
        return json.dumps(desc)


@dataclass
class BenchmarkTensors:
    """
    Input/Output tensors used for benchmarks
    """

    # matmul tensors
    input: torch.Tensor
    lora_weights_lst: list[torch.Tensor]
    output: torch.Tensor
    # LoRA kernel metadata
    lora_kernel_meta: LoRAKernelMeta
    # Metadata tensors used in testing correctness
    seq_lens: torch.Tensor
    prompt_lora_mapping: torch.Tensor

    def io_types(self) -> str:
        return (
            f"{dtype_to_str(self.input.dtype)}x"
            f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>"
            f"{dtype_to_str(self.output.dtype)}"
        )

    @staticmethod
    def make(
        ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
    ) -> "BenchmarkTensors":
        # Make input / output matmul tensors.
        a_shape, b_shape, c_shape = op_type.matmul_shapes(
            ctx.batch_size,
            ctx.seq_length,
            ctx.hidden_size,
            ctx.lora_rank,
            ctx.num_loras,
            ctx.num_slices,
        )
        a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
        input_tensor, lora_weights, output_tensor = make_rand_tensors(
            a_shape, b_shape, c_shape, a_type, b_type, c_type, num_slices=ctx.num_slices
        )

        # Make metadata tensors.
        # Keep the metadata tensors in the CPU for further processing if needed.
        # The tensors get moved to the GPU before benchmarking.
        assert ctx.num_active_loras <= ctx.num_loras
        total_tokens = ctx.batch_size * ctx.seq_length

        # Make metadata tensors involved in correctness testing.
        # Prepare seq lens tensor
        seq_len_tensor = torch.randint(
            ctx.seq_length, ctx.seq_length + 1, (ctx.batch_size,)
        )
        assert total_tokens == seq_len_tensor.sum()
        # Prepare prompt lora indices tensor
        prompt_lora_indices_tensor = make_prompt_lora_mapping(
            ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu"
        )

        # Make LoRAKernelMeta
        token_lora_indices_tensor = make_token_lora_mapping(
            total_tokens,
            ctx.batch_size,
            prompt_lora_indices_tensor,
            seq_len_tensor,
            "cpu",
        )
        lora_kernel_meta = LoRAKernelMeta.make(
            max_loras=ctx.num_loras,
            max_num_tokens=token_lora_indices_tensor.size(0),
            device="cpu",
        )
        lora_kernel_meta.prepare_tensors(token_lora_mapping=token_lora_indices_tensor)

        return BenchmarkTensors(
            input_tensor,
            lora_weights,
            output_tensor,
            lora_kernel_meta,
            seq_len_tensor,
            prompt_lora_indices_tensor,
        )

    def sanity_check(self) -> None:
        """
        Fails asserts when non-conformality is detected.
        """
        num_tokens = self.input.shape[-2]
        # check metadata tensors
        assert torch.sum(self.seq_lens) == num_tokens
        num_seqs = self.seq_lens.shape[0]
        # assert self.seq_start_loc.shape[0] == num_seqs
        assert self.prompt_lora_mapping.shape[0] == num_seqs
        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens

    def to_device(self, device: str):
        """
        Transfer tensors to device if the tensors aren't already on the device
        """

        def to_device(tensor: torch.Tensor):
            if tensor.device != device:
                tensor = tensor.to(device=device)
            return tensor

        self.input = to_device(self.input)
        self.output = to_device(self.output)
        self.seq_lens = to_device(self.seq_lens)
        self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
        for i in range(len(self.lora_weights_lst)):
            self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])

        # LoRA meta
        for field_name in LoRAKernelMeta.__dataclass_fields__:
            field = getattr(self.lora_kernel_meta, field_name)
            assert isinstance(field, torch.Tensor)
            setattr(self.lora_kernel_meta, field_name, to_device(field))

    def metadata(self) -> tuple[int, int, int]:
        """
        Return num_seqs, num_tokens and max_seq_len
        """
        num_seqs = self.seq_lens.shape[0]
        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
        max_seq_len = torch.max(self.seq_lens).item()
        num_slices = len(self.lora_weights_lst)
        return num_seqs, num_tokens, max_seq_len, num_slices

    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
        self.sanity_check()
        self.to_device(self.input.device)

        _, num_tokens, _, num_slices = self.metadata()

        # Sanity check matrix shapes.
        i_shape, lw_shape, o_shape = (
            self.input.shape,
            self.lora_weights_lst[0].shape,
            self.output.shape,
        )
        # Expected input shape [num_tokens, hidden_size]
        assert len(i_shape) == 2
        assert i_shape[0] == num_tokens
        hidden_size = i_shape[1]
        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
        assert len(lw_shape) == 3
        assert lw_shape[2] == hidden_size
        lora_rank = lw_shape[1]
        # Expected output shape [num_slices, num_tokens, lora_rank]
        assert len(o_shape) == 3
        assert o_shape == (num_slices, num_tokens, lora_rank)

        return {
            "inputs": self.input,
            "lora_a_weights": self.lora_weights_lst,
            "output_tensor": self.output,
            "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
            "token_indices_sorted_by_lora_ids": (
                self.lora_kernel_meta.token_indices_sorted_by_lora_ids
            ),
            "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora,
            "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
            "lora_ids": self.lora_kernel_meta.active_lora_ids,
            "scaling": 1.0,
        }

    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
        self.sanity_check()
        self.to_device(self.input.device)

        _, num_tokens, _, num_slices = self.metadata()

        # Sanity check matrix shapes.
        i_shape, lw_shape, o_shape = (
            self.input.shape,
            self.lora_weights_lst[0].shape,
            self.output.shape,
        )
        # Expected input shape : [num_slices, num_tokens, lora_rank]
        assert len(i_shape) == 3
        assert i_shape[0] == num_slices
        assert i_shape[1] == num_tokens
        lora_rank = i_shape[2]
        # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
        assert len(lw_shape) == 3
        assert lw_shape[2] == lora_rank
        hidden_size = lw_shape[1]
        # Expected output shape : [num_tokens, hidden_size * num_slices]
        assert len(o_shape) == 2
        assert o_shape == (num_tokens, hidden_size * num_slices)

        return {
            "inputs": self.input,
            "lora_b_weights": self.lora_weights_lst,
            "output_tensor": self.output,
            "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
            "token_indices_sorted_by_lora_ids": (
                self.lora_kernel_meta.token_indices_sorted_by_lora_ids
            ),
            "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora,
            "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
            "lora_ids": self.lora_kernel_meta.active_lora_ids,
            "offset_start": 0,
            "add_inputs": add_inputs,
        }

    def bench_fn_kwargs(
        self, op_type: OpType, add_inputs: Optional[bool] = None
    ) -> dict[str, Any]:
        if op_type.is_shrink_fn():
            assert add_inputs is None
        else:
            assert add_inputs is not None

        if op_type == OpType.LORA_SHRINK:
            return self.as_lora_shrink_kwargs()
        if op_type == OpType.LORA_EXPAND:
            return self.as_lora_expand_kwargs(add_inputs)
        raise ValueError(f"Unrecognized optype {self}")

    def test_correctness(
        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
    ) -> bool:
        """
        Test correctness of op_type implementation against a grouped gemm
        reference implementation.
        """
        seq_lens_cpu = self.seq_lens.to(device="cpu")
        prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu")
        ref_output = self.output.clone()

        self.output.zero_()
        op_type.bench_fn()(**self.bench_fn_kwargs(op_type, expand_fn_add_inputs))

        op_type.run_ref_group_gemm(
            ref_output,
            self.input,
            self.lora_weights_lst,
            seq_lens_cpu=seq_lens_cpu,
            prompt_lora_mapping_cpu=prompt_lora_mapping_cpu,
            scaling=1.0,
            add_inputs=expand_fn_add_inputs,
        )

        rtol, atol = {
            torch.float16: (6e-2, 6e-2),
            torch.bfloat16: (6e-2, 6e-2),
            torch.float32: (1e-2, 1e-2),
        }[self.output.dtype]

        return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol)


def bench_optype(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
    cuda_graph_nops: Optional[int] = None,
    expand_fn_add_inputs: Optional[bool] = None,
    test_correctness: bool = False,
) -> TMeasurement:
    assert arg_pool_size >= 1
    if op_type.is_shrink_fn():
        assert expand_fn_add_inputs is None
    else:
        assert expand_fn_add_inputs is not None

    # BenchmarkContext -> BenchmarkTensors
    bench_tensors: list[BenchmarkTensors] = [
        BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)
    ]
    for bt in bench_tensors:
        bt.sanity_check()

    # Test correctness of our implementation.
    if test_correctness:
        assert all(
            [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors]
        )

    # BenchmarkTensors -> dict (kwargs)
    kwargs_list = [
        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
        for bt in bench_tensors
    ]

    # Clear LoRA optimization hash-maps.
    _LORA_A_PTR_DICT.clear()
    _LORA_B_PTR_DICT.clear()
    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
    torch.cuda.synchronize()

    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
    for _kwargs in kwargs_list:
        for k, v in _kwargs.items():
            kwargs[k].values.append(v)

    describe_args = (
        f"add_inputs={expand_fn_add_inputs}" if expand_fn_add_inputs is not None else ""
    )
    description = f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})"

    cuda_graph_params = None
    if cuda_graph_nops:
        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
    timer = None
    with Bench(
        cuda_graph_params,
        ctx.bench_label(),
        ctx.bench_sublabel(op_type),
        description,
        op_type.bench_fn(),
        **kwargs,
    ) as bench:
        timer = bench.run()
    return timer


def bench_torch_mm(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
    cuda_graph_nops: Optional[int] = None,
) -> TMeasurement:
    """
    Benchmark basic torch.mm as a roofline.

    When all the input tokens have the same LoRA ID, the LoRA kernels are just
    a matmul. This torch.mm benchmark serves as a roofline for that case.

    input op_type is used in determining the m, k, n dimensions for the matmul.
    """

    batch_size, hidden_size, lora_rank, seq_length, dtype = (
        ctx.batch_size,
        ctx.hidden_size,
        ctx.lora_rank,
        ctx.seq_length,
        ctx.dtype,
    )

    m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank)
    # For a fairer comparison.
    n = n * ctx.num_slices

    # Get matmul input and output tensors for A x B = C
    As, Bs, Cs = [], [], []
    for _ in range(arg_pool_size):
        As.append(torch.rand((m, k), dtype=dtype).to("cuda"))
        Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t())
        Cs.append(torch.rand((m, n), dtype=dtype).to("cuda"))

    # Make torch.mm kwargs
    mm_kwargs = {"input": ArgPool(As), "mat2": ArgPool(Bs), "out": ArgPool(Cs)}

    description = (
        f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}"
        f"x{dtype_to_str(dtype)}"
        f"=>{dtype_to_str(dtype)})"
    )
    cuda_graph_params = None
    if cuda_graph_nops:
        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
    with Bench(
        cuda_graph_params,
        ctx.bench_label(),
        ctx.bench_sublabel(op_type),
        description,
        torch.mm,
        **mm_kwargs,
    ) as bench:
        return bench.run()


# runner
def use_cuda_graph_recommendation() -> str:
    return """
            Triton kernels have a significant launch overhead with
            launched directly via python. This overhead is more noticeable
            for small the problem sizes. For these cases, it is recommended
            to use the script with `--cuda-graph-nops N` to benchmark N
            consecutive invocations of the benchmarking operations from
            inside a CUDA Graph. Note that the returned measurement is for N
            invocations of the operation.
            """


def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
    compare = TBenchmark.Compare(timers)
    compare.print()

    if args and args.cuda_graph_nops:
        print(
            f"Note : The timings reported above is for {args.cuda_graph_nops} "
            "consecutive invocations of the benchmarking functions. "
            f"Please divide by {args.cuda_graph_nops} for single invocation "
            "timings."
        )

    print(
        "Note on Comparison with torch.mm : The torch.mm numbers are "
        "benchmark numbers of a simple matmul emulating the single lora "
        "case. It is provided as a roofline for comparing our LoRA Kernel "
        "implementations. It is expected that the LoRA kernels will be "
        "slower than torch.mm in cases where num_loras is big. But for "
        "small num_loras the goal should be to match the torch.mm numbers."
    )


def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
    if args.cuda_graph_nops is not None:
        assert args.cuda_graph_nops > 0
        print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA Graph")
    else:
        print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}")

    timers = []
    for bench_ctx in bench_ctxs:
        for seq_len in args.seq_lengths:
            bench_ops: list[OpType] = args.op_types

            seq_len_timers = []
            for bench_op in bench_ops:
                for num_slices in bench_op.num_slices():
                    _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices(
                        num_slices
                    )
                    # Benchmark torch.mm as a roofline
                    seq_len_timers.append(
                        bench_torch_mm(
                            _ctx, args.arg_pool_size, bench_op, args.cuda_graph_nops
                        )
                    )

                    # Benchmark bench_op
                    expand_fn_add_inputs = (
                        [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
                    )
                    for add_input_arg in expand_fn_add_inputs:
                        seq_len_timers.append(
                            bench_optype(
                                _ctx,
                                args.arg_pool_size,
                                bench_op,
                                args.cuda_graph_nops,
                                add_input_arg,
                                args.test_correctness,
                            )
                        )

            print_timers(seq_len_timers)
            timers.extend(seq_len_timers)

    # Result stdout dump
    print("== All Results ====")
    print_timers(timers, args)

    if args.output_directory:
        # Result file dump
        od = Path(args.output_directory)
        if not od.exists():
            od.mkdir()

        timestamp = int(time.time())
        pkl_file = od / f"lora_bench-{timestamp}.pkl"
        print(f"Writing benchmarks to {pkl_file}")
        with open(pkl_file, "wb") as f:
            pickle.dump(timers, f)


def as_benchmark_contexts(
    hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace
) -> list[BenchmarkContext]:
    ctxs: list[BenchmarkContext] = []
    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
        args.batch_sizes,
        list(hidden_sizes),
        lora_ranks,
        args.num_loras,
        args.sort_by_lora_id,
    ):
        ctxs.append(
            BenchmarkContext(
                batch_size=batch_size,
                hidden_size=hidden_size,
                lora_rank=lora_rank,
                num_loras=num_loras,
                num_active_loras=args.num_active_loras
                if args.num_active_loras
                else num_loras,
                # To be filled based on the OpType to benchmark
                seq_length=None,
                sort_by_lora_id=sort_by_lora_id,
                dtype=args.dtype,
                # To be filled based on the OpType to benchmark
                num_slices=None,
            )
        )

    return ctxs


def run_list_bench(args: argparse.Namespace):
    print(args)

    print(
        "List bench :\n"
        f"   Hidden Sizes {args.hidden_sizes}"
        f"   LoRA Ranks {args.lora_ranks}"
    )

    # Get all benchmarking contexts
    bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
        hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args
    )

    run(args, bench_contexts)


def run_range_bench(args: argparse.Namespace):
    print(args)

    hidden_sizes = list(
        range(
            args.hidden_sizes_start,
            args.hidden_sizes_end + 1,
            args.hidden_sizes_increment,
        )
    )
    lora_ranks = list(
        range(args.lora_ranks_start, args.lora_ranks_end + 1, args.lora_ranks_increment)
    )

    print(f"Range bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {lora_ranks}")

    # Get all benchmarking contexts
    bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
        hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args
    )

    run(args, bench_contexts)


def run_model_bench(args: argparse.Namespace):
    print(args)

    def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]:
        hidden_sizes = set()
        for KN, tp_split_dim in WEIGHT_SHAPES[model]:
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
            hidden_sizes.add(KN[1])
        return hidden_sizes

    # Get all hidden sizes
    hidden_sizes: set[int] = set()
    for model_name, tp_size in product(args.models, args.tp_sizes):
        hidden_sizes = hidden_sizes.union(hidden_sizes_from_model(model_name, tp_size))

    print(f"Model bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {args.lora_ranks}")

    # Get all benchmarking contexts
    bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
        hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args
    )

    run(args, bench_contexts)


if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "torch.float16":
            return torch.float16
        if dt == "torch.bfloat16":
            return torch.bfloat16
        raise ValueError("unsupported dtype")

    def get_bool(s: str) -> bool:
        return s.lower() in ["true", "1"]

    def add_common_command_args(p: argparse.ArgumentParser):
        p.add_argument(
            "--dtype",
            type=to_torch_dtype,
            required=True,
            help="Available options are ['torch.float16', 'torch.bfloat16']",
        )

        p.add_argument(
            "--arg-pool-size",
            type=int,
            default=32,
            help="Run profiles with a pool of input/output/meta tensors instead"
            "of simply reusing the same tensors for all runs. A bigger arg-pool"
            "mitigates hardware caching effects during benchmarking.",
        )

        p.add_argument(
            "--cuda-graph-nops",
            type=int,
            help=(
                "when set profiling is done using cudagraph, "
                "with the given number of operations in a graph."
                "Note that the measurement returned is the time "
                "taken for N consecutive executions of the benchmarking "
                "functions, where N is the value of this argument."
            ),
        )
        p.add_argument("--num-loras", nargs="+", type=int, default=DEFAULT_NUM_LORAS)
        p.add_argument(
            "--num-active-loras",
            type=int,
            default=None,
            help="Active LoRAs. When None, all LoRAs are active",
        )
        p.add_argument(
            "--sort-by-lora-id",
            nargs="+",
            type=get_bool,
            default=DEFAULT_SORT_BY_LORA_IDS,
        )
        p.add_argument("--op-types", nargs="+", type=OpType.from_str, default=list(OpType))
        p.add_argument("--seq-lengths", nargs="+", type=int, default=DEFAULT_SEQ_LENGTHS)
        p.add_argument("--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES)
        p.add_argument(
            "--expand-fn-add-inputs",
            nargs="+",
            type=get_bool,
            default=DEFAULT_EXPAND_FN_ADD_INPUTS,
        )
        p.add_argument(
            "-o",
            "--output-directory",
            type=str,
            help=(
                "Output directory to store a the list of benchmarking"
                "TMeasurement objects as a pickle file"
            ),
        )

        p.add_argument(
            "--test-correctness",
            action="store_true",
            help=(
                "When enabled, the benchmarking functions are tested"
                "for correctness before the actual benchmarking"
            ),
        )

    parser = FlexibleArgumentParser(
        description=f"""
Benchmark LoRA kernels:
    {use_cuda_graph_recommendation()}

    list_bench example:
        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32

    model_bench example:
        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32

    range_bench example:
        python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="cmd", required=True)

    list_parser = subparsers.add_parser("list_bench")
    list_parser.add_argument(
        "--hidden-sizes", nargs="+", type=int, default=DEFAULT_HIDDEN_SIZES
    )
    list_parser.add_argument(
        "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS
    )
    add_common_command_args(list_parser)
    list_parser.set_defaults(func=run_list_bench)

    range_parser = subparsers.add_parser("range_bench")
    range_parser.add_argument("--hidden-sizes-start", type=int, required=True)
    range_parser.add_argument("--hidden-sizes-end", type=int, required=True)
    range_parser.add_argument("--hidden-sizes-increment", type=int, required=True)
    range_parser.add_argument("--lora-ranks-start", type=int, required=True)
    range_parser.add_argument("--lora-ranks-end", type=int, required=True)
    range_parser.add_argument("--lora-ranks-increment", type=int, required=True)
    add_common_command_args(range_parser)
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    model_parser.add_argument(
        "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS
    )
    add_common_command_args(model_parser)
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    args.func(args)
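As a complement to the usage examples embedded in the parser help above, the following self-contained sketch (not part of the commit) illustrates the tensor shapes that lora_shrink and lora_expand operate on, per OpType.matmul_shapes(), using plain torch ops and the simplifying assumption that every token maps to LoRA index 0.

# Illustrative shape walk-through for the LoRA shrink/expand ops benchmarked above.
import torch

num_tokens, hidden_size, lora_rank, num_loras, num_slices = 8, 32, 4, 2, 3

# Shrink: (num_tokens, hidden) x (rank, hidden)^T -> (num_slices, num_tokens, rank)
x = torch.randn(num_tokens, hidden_size)
lora_a = [torch.randn(num_loras, lora_rank, hidden_size) for _ in range(num_slices)]
shrunk = torch.stack([x @ a[0].t() for a in lora_a])  # every token uses LoRA 0 here
print(shrunk.shape)  # torch.Size([3, 8, 4])

# Expand: (num_slices, num_tokens, rank) x (hidden, rank)^T -> (num_tokens, hidden * num_slices)
lora_b = [torch.randn(num_loras, hidden_size, lora_rank) for _ in range(num_slices)]
expanded = torch.cat([shrunk[i] @ lora_b[i][0].t() for i in range(num_slices)], dim=-1)
print(expanded.shape)  # torch.Size([8, 96])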