Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf069aa8
Unverified
Commit
cf069aa8
authored
Mar 03, 2025
by
Harry Mellor
Committed by
GitHub
Mar 02, 2025
Browse files
Update deprecated Python 3.8 typing (#13971)
parent
bf33700e
Changes
300
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
198 additions
and
195 deletions
+198
-195
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+3
-3
benchmarks/benchmark_guided.py
benchmarks/benchmark_guided.py
+8
-9
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+3
-3
benchmarks/benchmark_prefix_caching.py
benchmarks/benchmark_prefix_caching.py
+8
-8
benchmarks/benchmark_prioritization.py
benchmarks/benchmark_prioritization.py
+4
-4
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+39
-38
benchmarks/benchmark_serving_guided.py
benchmarks/benchmark_serving_guided.py
+29
-28
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+19
-19
benchmarks/benchmark_utils.py
benchmarks/benchmark_utils.py
+4
-4
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+5
-4
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/utils.py
+4
-4
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+9
-8
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+3
-2
benchmarks/kernels/benchmark_lora.py
benchmarks/kernels/benchmark_lora.py
+30
-30
benchmarks/kernels/benchmark_machete.py
benchmarks/kernels/benchmark_machete.py
+13
-12
benchmarks/kernels/benchmark_marlin.py
benchmarks/kernels/benchmark_marlin.py
+2
-4
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+9
-9
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_paged_attention.py
+2
-2
benchmarks/kernels/benchmark_rmsnorm.py
benchmarks/kernels/benchmark_rmsnorm.py
+2
-2
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_rope.py
+2
-2
No files found.
benchmarks/backend_request_func.py
View file @
cf069aa8
...
...
@@ -6,7 +6,7 @@ import sys
import
time
import
traceback
from
dataclasses
import
dataclass
,
field
from
typing
import
List
,
Optional
,
Union
from
typing
import
Optional
,
Union
import
aiohttp
import
huggingface_hub.constants
...
...
@@ -41,8 +41,8 @@ class RequestFuncOutput:
latency
:
float
=
0.0
output_tokens
:
int
=
0
ttft
:
float
=
0.0
# Time to first token
itl
:
L
ist
[
float
]
=
field
(
default_factory
=
list
)
#
L
ist of inter-token latencies
itl
:
l
ist
[
float
]
=
field
(
default_factory
=
list
)
#
l
ist of inter-token latencies
tpot
:
float
=
0.0
# avg next-token latencies
prompt_len
:
int
=
0
error
:
str
=
""
...
...
benchmarks/benchmark_guided.py
View file @
cf069aa8
...
...
@@ -6,7 +6,6 @@ import json
import
os
import
random
import
time
from
typing
import
List
import
datasets
import
pandas
as
pd
...
...
@@ -39,7 +38,7 @@ class SampleRequest:
completion
:
str
=
None
def
run_vllm
(
requests
:
L
ist
[
SampleRequest
],
def
run_vllm
(
requests
:
l
ist
[
SampleRequest
],
engine_args
:
EngineArgs
,
n
:
int
,
guided_decoding_rate
:
float
=
1.0
,
...
...
@@ -54,8 +53,8 @@ def run_vllm(requests: List[SampleRequest],
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts
:
L
ist
[
str
]
=
[]
sampling_params
:
L
ist
[
SamplingParams
]
=
[]
prompts
:
l
ist
[
str
]
=
[]
sampling_params
:
l
ist
[
SamplingParams
]
=
[]
# create a list containing random selected true or false
guided_decoding_req_idx
=
random
.
sample
(
range
(
len
(
requests
)),
int
(
len
(
requests
)
*
guided_decoding_rate
))
...
...
@@ -110,7 +109,7 @@ def run_vllm(requests: List[SampleRequest],
async
def
run_vllm_async
(
requests
:
L
ist
[
SampleRequest
],
requests
:
l
ist
[
SampleRequest
],
engine_args
:
AsyncEngineArgs
,
n
:
int
,
guided_decoding_rate
:
float
=
1.0
,
...
...
@@ -129,8 +128,8 @@ async def run_vllm_async(
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts
:
L
ist
[
str
]
=
[]
sampling_params
:
L
ist
[
SamplingParams
]
=
[]
prompts
:
l
ist
[
str
]
=
[]
sampling_params
:
l
ist
[
SamplingParams
]
=
[]
guided_decoding_req_idx
=
random
.
sample
(
range
(
len
(
requests
)),
int
(
len
(
requests
)
*
guided_decoding_rate
))
...
...
@@ -203,7 +202,7 @@ async def run_vllm_async(
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
L
ist
[
SampleRequest
]:
args
:
argparse
.
Namespace
)
->
l
ist
[
SampleRequest
]:
if
args
.
dataset
==
'json'
:
if
args
.
json_schema_path
is
None
:
dir_path
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
...
...
@@ -287,7 +286,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
elif
args
.
dataset
==
"xgrammar_bench"
:
args
.
warmup
=
False
requests
:
L
ist
[
SampleRequest
]
=
[]
requests
:
l
ist
[
SampleRequest
]
=
[]
dataset
=
datasets
.
load_dataset
(
"NousResearch/json-mode-eval"
,
split
=
"train"
)
print
(
f
"dataset has
{
len
(
dataset
)
}
entries"
)
...
...
benchmarks/benchmark_latency.py
View file @
cf069aa8
...
...
@@ -7,7 +7,7 @@ import json
import
os
import
time
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Optional
import
numpy
as
np
import
torch
...
...
@@ -22,7 +22,7 @@ from vllm.utils import FlexibleArgumentParser
def
save_to_pytorch_benchmark_format
(
args
:
argparse
.
Namespace
,
results
:
D
ict
[
str
,
Any
])
->
None
:
results
:
d
ict
[
str
,
Any
])
->
None
:
pt_records
=
convert_to_pytorch_benchmark_format
(
args
=
args
,
metrics
=
{
"latency"
:
results
[
"latencies"
]},
...
...
@@ -57,7 +57,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
args
.
batch_size
,
args
.
input_len
))
dummy_prompts
:
L
ist
[
PromptType
]
=
[{
dummy_prompts
:
l
ist
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
...
...
benchmarks/benchmark_prefix_caching.py
View file @
cf069aa8
...
...
@@ -31,7 +31,7 @@ import dataclasses
import
json
import
random
import
time
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
from
transformers
import
PreTrainedTokenizerBase
...
...
@@ -77,9 +77,9 @@ def sample_requests_from_dataset(
dataset_path
:
str
,
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
input_length_range
:
T
uple
[
int
,
int
],
input_length_range
:
t
uple
[
int
,
int
],
fixed_output_len
:
Optional
[
int
],
)
->
L
ist
[
Request
]:
)
->
l
ist
[
Request
]:
if
fixed_output_len
is
not
None
and
fixed_output_len
<
4
:
raise
ValueError
(
"output_len too small"
)
...
...
@@ -99,7 +99,7 @@ def sample_requests_from_dataset(
assert
min_len
>=
0
and
max_len
>=
min_len
,
"input_length_range too small"
# Filter out sequences that are too long or too short
filtered_requests
:
L
ist
[
Request
]
=
[]
filtered_requests
:
l
ist
[
Request
]
=
[]
for
i
in
range
(
len
(
dataset
)):
if
len
(
filtered_requests
)
==
num_requests
:
...
...
@@ -122,10 +122,10 @@ def sample_requests_from_dataset(
def
sample_requests_from_random
(
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
input_length_range
:
T
uple
[
int
,
int
],
input_length_range
:
t
uple
[
int
,
int
],
fixed_output_len
:
Optional
[
int
],
prefix_len
:
int
,
)
->
L
ist
[
Request
]:
)
->
l
ist
[
Request
]:
requests
=
[]
prefix_token_ids
=
sample_tokens
(
tokenizer
,
prefix_len
)
...
...
@@ -144,9 +144,9 @@ def sample_requests_from_random(
return
requests
def
repeat_and_sort_requests
(
requests
:
L
ist
[
Request
],
def
repeat_and_sort_requests
(
requests
:
l
ist
[
Request
],
repeat_count
:
int
,
sort
:
bool
=
False
)
->
L
ist
[
str
]:
sort
:
bool
=
False
)
->
l
ist
[
str
]:
repeated_requests
=
requests
*
repeat_count
if
sort
:
repeated_requests
.
sort
(
key
=
lambda
x
:
x
[
1
])
...
...
benchmarks/benchmark_prioritization.py
View file @
cf069aa8
...
...
@@ -5,7 +5,7 @@ import dataclasses
import
json
import
random
import
time
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
...
...
@@ -23,7 +23,7 @@ def sample_requests(
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
],
)
->
L
ist
[
T
uple
[
str
,
int
,
int
]]:
)
->
l
ist
[
t
uple
[
str
,
int
,
int
]]:
if
fixed_output_len
is
not
None
and
fixed_output_len
<
4
:
raise
ValueError
(
"output_len too small"
)
...
...
@@ -40,7 +40,7 @@ def sample_requests(
random
.
shuffle
(
dataset
)
# Filter out sequences that are too long or too short
filtered_dataset
:
L
ist
[
T
uple
[
str
,
int
,
int
]]
=
[]
filtered_dataset
:
l
ist
[
t
uple
[
str
,
int
,
int
]]
=
[]
for
i
in
range
(
len
(
dataset
)):
if
len
(
filtered_dataset
)
==
num_requests
:
break
...
...
@@ -68,7 +68,7 @@ def sample_requests(
def
run_vllm
(
requests
:
L
ist
[
T
uple
[
str
,
int
,
int
]],
requests
:
l
ist
[
t
uple
[
str
,
int
,
int
]],
n
:
int
,
engine_args
:
EngineArgs
,
)
->
float
:
...
...
benchmarks/benchmark_serving.py
View file @
cf069aa8
...
...
@@ -33,9 +33,10 @@ import os
import
random
import
time
import
warnings
from
collections.abc
import
AsyncGenerator
,
Collection
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
typing
import
Any
,
AsyncGenerator
,
Collection
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
Optional
import
numpy
as
np
import
pandas
as
pd
...
...
@@ -73,22 +74,22 @@ class BenchmarkMetrics:
mean_ttft_ms
:
float
median_ttft_ms
:
float
std_ttft_ms
:
float
percentiles_ttft_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_ttft_ms
:
l
ist
[
t
uple
[
float
,
float
]]
mean_tpot_ms
:
float
median_tpot_ms
:
float
std_tpot_ms
:
float
percentiles_tpot_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_tpot_ms
:
l
ist
[
t
uple
[
float
,
float
]]
mean_itl_ms
:
float
median_itl_ms
:
float
std_itl_ms
:
float
percentiles_itl_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_itl_ms
:
l
ist
[
t
uple
[
float
,
float
]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms
:
float
median_e2el_ms
:
float
std_e2el_ms
:
float
percentiles_e2el_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_e2el_ms
:
l
ist
[
t
uple
[
float
,
float
]]
def
sample_sharegpt_requests
(
...
...
@@ -96,7 +97,7 @@ def sample_sharegpt_requests(
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
L
ist
[
T
uple
[
str
,
int
,
int
,
None
]]:
)
->
l
ist
[
t
uple
[
str
,
int
,
int
,
None
]]:
# Load the dataset.
with
open
(
dataset_path
,
encoding
=
'utf-8'
)
as
f
:
dataset
=
json
.
load
(
f
)
...
...
@@ -110,7 +111,7 @@ def sample_sharegpt_requests(
random
.
shuffle
(
dataset
)
# Filter out sequences that are too long or too short
filtered_dataset
:
L
ist
[
T
uple
[
str
,
int
,
int
]]
=
[]
filtered_dataset
:
l
ist
[
t
uple
[
str
,
int
,
int
]]
=
[]
for
i
in
range
(
len
(
dataset
)):
if
len
(
filtered_dataset
)
==
num_requests
:
break
...
...
@@ -139,7 +140,7 @@ def sample_burstgpt_requests(
num_requests
:
int
,
random_seed
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
L
ist
[
T
uple
[
str
,
int
,
int
,
None
]]:
)
->
l
ist
[
t
uple
[
str
,
int
,
int
,
None
]]:
df
=
pd
.
read_csv
(
dataset_path
)
gpt4_df
=
df
[
df
[
"Model"
]
==
"GPT-4"
]
# Remove the failed requests (i.e., response length is 0)
...
...
@@ -170,7 +171,7 @@ def sample_sonnet_requests(
output_len
:
int
,
prefix_len
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
L
ist
[
T
uple
[
str
,
str
,
int
,
int
,
None
]]:
)
->
l
ist
[
t
uple
[
str
,
str
,
int
,
int
,
None
]]:
assert
(
input_len
>
prefix_len
),
"'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
...
...
@@ -211,7 +212,7 @@ def sample_sonnet_requests(
prefix_lines
=
poem_lines
[:
num_prefix_lines
]
# Sample the rest of lines per request.
sampled_requests
:
L
ist
[
T
uple
[
str
,
int
,
int
]]
=
[]
sampled_requests
:
l
ist
[
t
uple
[
str
,
int
,
int
]]
=
[]
for
_
in
range
(
num_requests
):
num_lines_needed
=
num_input_lines
-
num_prefix_lines
sampled_lines
=
""
.
join
(
prefix_lines
+
...
...
@@ -238,8 +239,8 @@ def sample_vision_arena_requests(
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
L
ist
[
T
uple
[
str
,
str
,
int
,
Optional
[
D
ict
[
str
,
Collection
[
str
]]]]]:
sampled_requests
:
L
ist
[
T
uple
[
str
,
int
,
int
,
D
ict
[
str
,
)
->
l
ist
[
t
uple
[
str
,
str
,
int
,
Optional
[
d
ict
[
str
,
Collection
[
str
]]]]]:
sampled_requests
:
l
ist
[
t
uple
[
str
,
int
,
int
,
d
ict
[
str
,
Collection
[
str
]]]]
=
[]
for
data
in
dataset
:
if
len
(
sampled_requests
)
==
num_requests
:
...
...
@@ -285,7 +286,7 @@ def sample_hf_requests(
tokenizer
:
PreTrainedTokenizerBase
,
random_seed
:
int
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
L
ist
[
T
uple
[
str
,
str
,
int
,
Optional
[
D
ict
[
str
,
Collection
[
str
]]]]]:
)
->
l
ist
[
t
uple
[
str
,
str
,
int
,
Optional
[
d
ict
[
str
,
Collection
[
str
]]]]]:
# Special case for vision_arena dataset
if
dataset_path
==
'lmarena-ai/vision-arena-bench-v0.1'
\
...
...
@@ -307,7 +308,7 @@ def sample_hf_requests(
"HF Dataset must have 'conversations' column."
)
filter_func
=
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
filtered_dataset
=
dataset
.
shuffle
(
seed
=
random_seed
).
filter
(
filter_func
)
sampled_requests
:
L
ist
[
T
uple
[
str
,
int
,
int
,
D
ict
[
str
,
sampled_requests
:
l
ist
[
t
uple
[
str
,
int
,
int
,
d
ict
[
str
,
Collection
[
str
]]]]
=
[]
for
data
in
filtered_dataset
:
if
len
(
sampled_requests
)
==
num_requests
:
...
...
@@ -370,7 +371,7 @@ def sample_random_requests(
num_prompts
:
int
,
range_ratio
:
float
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
L
ist
[
T
uple
[
str
,
int
,
int
]]:
)
->
l
ist
[
t
uple
[
str
,
int
,
int
]]:
prefix_token_ids
=
np
.
random
.
randint
(
0
,
tokenizer
.
vocab_size
,
size
=
prefix_len
).
tolist
()
...
...
@@ -399,10 +400,10 @@ def sample_random_requests(
async
def
get_request
(
input_requests
:
L
ist
[
T
uple
[
str
,
int
,
int
]],
input_requests
:
l
ist
[
t
uple
[
str
,
int
,
int
]],
request_rate
:
float
,
burstiness
:
float
=
1.0
,
)
->
AsyncGenerator
[
T
uple
[
str
,
int
,
int
],
None
]:
)
->
AsyncGenerator
[
t
uple
[
str
,
int
,
int
],
None
]:
"""
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
...
...
@@ -443,23 +444,23 @@ async def get_request(
def
calculate_metrics
(
input_requests
:
L
ist
[
T
uple
[
str
,
int
,
int
]],
outputs
:
L
ist
[
RequestFuncOutput
],
input_requests
:
l
ist
[
t
uple
[
str
,
int
,
int
]],
outputs
:
l
ist
[
RequestFuncOutput
],
dur_s
:
float
,
tokenizer
:
PreTrainedTokenizerBase
,
selected_percentile_metrics
:
L
ist
[
str
],
selected_percentiles
:
L
ist
[
float
],
goodput_config_dict
:
D
ict
[
str
,
float
],
)
->
T
uple
[
BenchmarkMetrics
,
L
ist
[
int
]]:
actual_output_lens
:
L
ist
[
int
]
=
[]
selected_percentile_metrics
:
l
ist
[
str
],
selected_percentiles
:
l
ist
[
float
],
goodput_config_dict
:
d
ict
[
str
,
float
],
)
->
t
uple
[
BenchmarkMetrics
,
l
ist
[
int
]]:
actual_output_lens
:
l
ist
[
int
]
=
[]
total_input
=
0
completed
=
0
good_completed
=
0
itls
:
L
ist
[
float
]
=
[]
tpots
:
L
ist
[
float
]
=
[]
all_tpots
:
L
ist
[
float
]
=
[]
ttfts
:
L
ist
[
float
]
=
[]
e2els
:
L
ist
[
float
]
=
[]
itls
:
l
ist
[
float
]
=
[]
tpots
:
l
ist
[
float
]
=
[]
all_tpots
:
l
ist
[
float
]
=
[]
ttfts
:
l
ist
[
float
]
=
[]
e2els
:
l
ist
[
float
]
=
[]
for
i
in
range
(
len
(
outputs
)):
if
outputs
[
i
].
success
:
output_len
=
outputs
[
i
].
output_tokens
...
...
@@ -557,19 +558,19 @@ async def benchmark(
model_id
:
str
,
model_name
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
input_requests
:
L
ist
[
T
uple
[
str
,
int
,
int
]],
input_requests
:
l
ist
[
t
uple
[
str
,
int
,
int
]],
logprobs
:
Optional
[
int
],
best_of
:
int
,
request_rate
:
float
,
burstiness
:
float
,
disable_tqdm
:
bool
,
profile
:
bool
,
selected_percentile_metrics
:
L
ist
[
str
],
selected_percentiles
:
L
ist
[
str
],
selected_percentile_metrics
:
l
ist
[
str
],
selected_percentiles
:
l
ist
[
str
],
ignore_eos
:
bool
,
goodput_config_dict
:
D
ict
[
str
,
float
],
goodput_config_dict
:
d
ict
[
str
,
float
],
max_concurrency
:
Optional
[
int
],
lora_modules
:
Optional
[
L
ist
[
str
]],
lora_modules
:
Optional
[
l
ist
[
str
]],
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
...
...
@@ -652,7 +653,7 @@ async def benchmark(
pbar
=
pbar
)
benchmark_start_time
=
time
.
perf_counter
()
tasks
:
L
ist
[
asyncio
.
Task
]
=
[]
tasks
:
l
ist
[
asyncio
.
Task
]
=
[]
async
for
request
in
get_request
(
input_requests
,
request_rate
,
burstiness
):
prompt
,
prompt_len
,
output_len
,
mm_content
=
request
req_model_id
,
req_model_name
=
model_id
,
model_name
...
...
@@ -674,7 +675,7 @@ async def benchmark(
asyncio
.
create_task
(
limited_request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)))
outputs
:
L
ist
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
outputs
:
l
ist
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
if
profile
:
print
(
"Stopping profiler..."
)
...
...
@@ -820,7 +821,7 @@ def parse_goodput(slo_pairs):
def
save_to_pytorch_benchmark_format
(
args
:
argparse
.
Namespace
,
results
:
D
ict
[
str
,
Any
],
results
:
d
ict
[
str
,
Any
],
file_name
:
str
)
->
None
:
metrics
=
[
"median_ttft_ms"
,
"mean_ttft_ms"
,
"std_ttft_ms"
,
"p99_ttft_ms"
,
...
...
@@ -974,7 +975,7 @@ def main(args: argparse.Namespace):
# Save config and results to json
if
args
.
save_result
:
result_json
:
D
ict
[
str
,
Any
]
=
{}
result_json
:
d
ict
[
str
,
Any
]
=
{}
# Setup
current_dt
=
datetime
.
now
().
strftime
(
"%Y%m%d-%H%M%S"
)
...
...
benchmarks/benchmark_serving_guided.py
View file @
cf069aa8
...
...
@@ -30,8 +30,9 @@ import os
import
random
import
time
import
warnings
from
collections.abc
import
AsyncGenerator
from
dataclasses
import
dataclass
from
typing
import
AsyncGenerator
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Optional
import
datasets
import
numpy
as
np
...
...
@@ -66,22 +67,22 @@ class BenchmarkMetrics:
mean_ttft_ms
:
float
median_ttft_ms
:
float
std_ttft_ms
:
float
percentiles_ttft_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_ttft_ms
:
l
ist
[
t
uple
[
float
,
float
]]
mean_tpot_ms
:
float
median_tpot_ms
:
float
std_tpot_ms
:
float
percentiles_tpot_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_tpot_ms
:
l
ist
[
t
uple
[
float
,
float
]]
mean_itl_ms
:
float
median_itl_ms
:
float
std_itl_ms
:
float
percentiles_itl_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_itl_ms
:
l
ist
[
t
uple
[
float
,
float
]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms
:
float
median_e2el_ms
:
float
std_e2el_ms
:
float
percentiles_e2el_ms
:
L
ist
[
T
uple
[
float
,
float
]]
percentiles_e2el_ms
:
l
ist
[
t
uple
[
float
,
float
]]
@
dataclasses
.
dataclass
...
...
@@ -104,7 +105,7 @@ class SampleRequest:
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
L
ist
[
SampleRequest
]:
args
:
argparse
.
Namespace
)
->
l
ist
[
SampleRequest
]:
if
args
.
dataset
==
'json'
:
if
args
.
json_schema_path
is
None
:
dir_path
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
...
...
@@ -187,7 +188,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
]
elif
args
.
dataset
==
"xgrammar_bench"
:
requests
:
L
ist
[
SampleRequest
]
=
[]
requests
:
l
ist
[
SampleRequest
]
=
[]
dataset
=
datasets
.
load_dataset
(
"NousResearch/json-mode-eval"
,
split
=
"train"
)
print
(
f
"dataset has
{
len
(
dataset
)
}
entries"
)
...
...
@@ -214,10 +215,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
async
def
get_request
(
input_requests
:
L
ist
[
SampleRequest
],
input_requests
:
l
ist
[
SampleRequest
],
request_rate
:
float
,
burstiness
:
float
=
1.0
,
)
->
AsyncGenerator
[
T
uple
[
int
,
SampleRequest
],
None
]:
)
->
AsyncGenerator
[
t
uple
[
int
,
SampleRequest
],
None
]:
"""
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
...
...
@@ -258,23 +259,23 @@ async def get_request(
def
calculate_metrics
(
input_requests
:
L
ist
[
T
uple
[
str
,
int
,
int
]],
outputs
:
L
ist
[
RequestFuncOutput
],
input_requests
:
l
ist
[
t
uple
[
str
,
int
,
int
]],
outputs
:
l
ist
[
RequestFuncOutput
],
dur_s
:
float
,
tokenizer
:
PreTrainedTokenizerBase
,
selected_percentile_metrics
:
L
ist
[
str
],
selected_percentiles
:
L
ist
[
float
],
goodput_config_dict
:
Optional
[
D
ict
[
str
,
float
]]
=
None
,
)
->
T
uple
[
BenchmarkMetrics
,
L
ist
[
int
]]:
actual_output_lens
:
L
ist
[
int
]
=
[]
selected_percentile_metrics
:
l
ist
[
str
],
selected_percentiles
:
l
ist
[
float
],
goodput_config_dict
:
Optional
[
d
ict
[
str
,
float
]]
=
None
,
)
->
t
uple
[
BenchmarkMetrics
,
l
ist
[
int
]]:
actual_output_lens
:
l
ist
[
int
]
=
[]
total_input
=
0
completed
=
0
good_completed
=
0
itls
:
L
ist
[
float
]
=
[]
tpots
:
L
ist
[
float
]
=
[]
all_tpots
:
L
ist
[
float
]
=
[]
ttfts
:
L
ist
[
float
]
=
[]
e2els
:
L
ist
[
float
]
=
[]
itls
:
l
ist
[
float
]
=
[]
tpots
:
l
ist
[
float
]
=
[]
all_tpots
:
l
ist
[
float
]
=
[]
ttfts
:
l
ist
[
float
]
=
[]
e2els
:
l
ist
[
float
]
=
[]
for
i
in
range
(
len
(
outputs
)):
if
outputs
[
i
].
success
:
# We use the tokenizer to count the number of output tokens for all
...
...
@@ -368,18 +369,18 @@ async def benchmark(
base_url
:
str
,
model_id
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
input_requests
:
L
ist
[
SampleRequest
],
input_requests
:
l
ist
[
SampleRequest
],
request_rate
:
float
,
burstiness
:
float
,
disable_tqdm
:
bool
,
profile
:
bool
,
selected_percentile_metrics
:
L
ist
[
str
],
selected_percentiles
:
L
ist
[
str
],
selected_percentile_metrics
:
l
ist
[
str
],
selected_percentiles
:
l
ist
[
str
],
ignore_eos
:
bool
,
max_concurrency
:
Optional
[
int
],
guided_decoding_ratio
:
float
,
guided_decoding_backend
:
str
,
goodput_config_dict
:
Optional
[
D
ict
[
str
,
float
]]
=
None
,
goodput_config_dict
:
Optional
[
d
ict
[
str
,
float
]]
=
None
,
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
...
...
@@ -459,8 +460,8 @@ async def benchmark(
pbar
=
pbar
)
benchmark_start_time
=
time
.
perf_counter
()
tasks
:
L
ist
[
asyncio
.
Task
]
=
[]
expected
:
L
ist
[
str
]
=
[]
tasks
:
l
ist
[
asyncio
.
Task
]
=
[]
expected
:
l
ist
[
str
]
=
[]
async
for
i
,
request
in
get_request
(
input_requests
,
request_rate
,
burstiness
):
extra_body
=
prepare_extra_body
(
...
...
@@ -479,7 +480,7 @@ async def benchmark(
asyncio
.
create_task
(
limited_request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)))
outputs
:
L
ist
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
outputs
:
l
ist
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
if
profile
:
print
(
"Stopping profiler..."
)
...
...
benchmarks/benchmark_throughput.py
View file @
cf069aa8
...
...
@@ -7,7 +7,7 @@ import os
import
random
import
time
from
functools
import
cache
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
Optional
import
torch
import
uvloop
...
...
@@ -74,12 +74,12 @@ def lora_path_on_disk(lora_path: str) -> str:
return
get_adapter_absolute_path
(
lora_path
)
lora_tokenizer_cache
:
D
ict
[
int
,
AnyTokenizer
]
=
{}
lora_tokenizer_cache
:
d
ict
[
int
,
AnyTokenizer
]
=
{}
def
get_random_lora_request
(
args
:
argparse
.
Namespace
)
->
T
uple
[
LoRARequest
,
Optional
[
AnyTokenizer
]]:
)
->
t
uple
[
LoRARequest
,
Optional
[
AnyTokenizer
]]:
global
lora_tokenizer_cache
lora_id
=
random
.
randint
(
1
,
args
.
max_loras
)
lora_request
=
LoRARequest
(
lora_name
=
str
(
lora_id
),
...
...
@@ -91,7 +91,7 @@ def get_random_lora_request(
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
L
ist
[
SampleRequest
]:
args
:
argparse
.
Namespace
)
->
l
ist
[
SampleRequest
]:
dataset_path
:
str
=
args
.
dataset
num_requests
:
int
=
args
.
num_prompts
...
...
@@ -109,7 +109,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
random
.
shuffle
(
dataset
)
# Filter out sequences that are too long or too short
filtered_dataset
:
L
ist
[
SampleRequest
]
=
[]
filtered_dataset
:
l
ist
[
SampleRequest
]
=
[]
for
data
in
tqdm
(
dataset
,
total
=
len
(
filtered_dataset
),
desc
=
"sampling requests"
):
...
...
@@ -165,7 +165,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def
run_vllm
(
requests
:
L
ist
[
SampleRequest
],
requests
:
l
ist
[
SampleRequest
],
n
:
int
,
engine_args
:
EngineArgs
,
)
->
float
:
...
...
@@ -178,8 +178,8 @@ def run_vllm(
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts
:
L
ist
[
TextPrompt
]
=
[]
sampling_params
:
L
ist
[
SamplingParams
]
=
[]
prompts
:
l
ist
[
TextPrompt
]
=
[]
sampling_params
:
l
ist
[
SamplingParams
]
=
[]
for
request
in
requests
:
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
...
...
@@ -192,7 +192,7 @@ def run_vllm(
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
))
lora_requests
:
Optional
[
L
ist
[
LoRARequest
]]
=
None
lora_requests
:
Optional
[
l
ist
[
LoRARequest
]]
=
None
if
engine_args
.
enable_lora
:
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
...
...
@@ -225,7 +225,7 @@ def run_vllm(
async
def
run_vllm_async
(
requests
:
L
ist
[
SampleRequest
],
requests
:
l
ist
[
SampleRequest
],
n
:
int
,
engine_args
:
AsyncEngineArgs
,
disable_frontend_multiprocessing
:
bool
=
False
,
...
...
@@ -242,9 +242,9 @@ async def run_vllm_async(
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts
:
L
ist
[
TextPrompt
]
=
[]
sampling_params
:
L
ist
[
SamplingParams
]
=
[]
lora_requests
:
L
ist
[
Optional
[
LoRARequest
]]
=
[]
prompts
:
l
ist
[
TextPrompt
]
=
[]
sampling_params
:
l
ist
[
SamplingParams
]
=
[]
lora_requests
:
l
ist
[
Optional
[
LoRARequest
]]
=
[]
for
request
in
requests
:
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
...
...
@@ -276,7 +276,7 @@ async def run_vllm_async(
def
run_hf
(
requests
:
L
ist
[
SampleRequest
],
requests
:
l
ist
[
SampleRequest
],
model
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
n
:
int
,
...
...
@@ -292,7 +292,7 @@ def run_hf(
pbar
=
tqdm
(
total
=
len
(
requests
))
start
=
time
.
perf_counter
()
batch
:
L
ist
[
str
]
=
[]
batch
:
l
ist
[
str
]
=
[]
max_prompt_len
=
0
max_output_len
=
0
for
i
in
range
(
len
(
requests
)):
...
...
@@ -334,7 +334,7 @@ def run_hf(
def
run_mii
(
requests
:
L
ist
[
SampleRequest
],
requests
:
l
ist
[
SampleRequest
],
model
:
str
,
tensor_parallel_size
:
int
,
output_len
:
int
,
...
...
@@ -352,7 +352,7 @@ def run_mii(
def
save_to_pytorch_benchmark_format
(
args
:
argparse
.
Namespace
,
results
:
D
ict
[
str
,
Any
])
->
None
:
results
:
d
ict
[
str
,
Any
])
->
None
:
pt_records
=
convert_to_pytorch_benchmark_format
(
args
=
args
,
metrics
=
{
...
...
@@ -479,8 +479,8 @@ if __name__ == "__main__":
type
=
str
,
default
=
None
,
help
=
"Path to the dataset. The dataset is expected to "
"be a json in form of
L
ist[
D
ict[..., conversations: "
"
L
ist[
D
ict[..., value: <prompt_or_response>]]]]"
)
"be a json in form of
l
ist[
d
ict[..., conversations: "
"
l
ist[
d
ict[..., value: <prompt_or_response>]]]]"
)
parser
.
add_argument
(
"--input-len"
,
type
=
int
,
default
=
None
,
...
...
benchmarks/benchmark_utils.py
View file @
cf069aa8
...
...
@@ -4,12 +4,12 @@ import argparse
import
json
import
math
import
os
from
typing
import
Any
,
Dict
,
List
from
typing
import
Any
def
convert_to_pytorch_benchmark_format
(
args
:
argparse
.
Namespace
,
metrics
:
D
ict
[
str
,
L
ist
],
extra_info
:
D
ict
[
str
,
Any
])
->
L
ist
:
metrics
:
d
ict
[
str
,
l
ist
],
extra_info
:
d
ict
[
str
,
Any
])
->
l
ist
:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
...
...
@@ -64,6 +64,6 @@ class InfEncoder(json.JSONEncoder):
return
super
().
iterencode
(
self
.
clear_inf
(
o
),
*
args
,
**
kwargs
)
def
write_to_json
(
filename
:
str
,
records
:
L
ist
)
->
None
:
def
write_to_json
(
filename
:
str
,
records
:
l
ist
)
->
None
:
with
open
(
filename
,
"w"
)
as
f
:
json
.
dump
(
records
,
f
,
cls
=
InfEncoder
)
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
View file @
cf069aa8
...
...
@@ -5,7 +5,8 @@ import copy
import
itertools
import
pickle
as
pkl
import
time
from
typing
import
Callable
,
Iterable
,
List
,
Tuple
from
collections.abc
import
Iterable
from
typing
import
Callable
import
torch
import
torch.utils.benchmark
as
TBenchmark
...
...
@@ -228,7 +229,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def
run
(
dtype
:
torch
.
dtype
,
MKNs
:
Iterable
[
T
uple
[
int
,
int
,
int
]])
->
Iterable
[
TMeasurement
]:
MKNs
:
Iterable
[
t
uple
[
int
,
int
,
int
]])
->
Iterable
[
TMeasurement
]:
results
=
[]
for
m
,
k
,
n
in
MKNs
:
timers
=
bench
(
dtype
,
m
,
k
,
n
,
f
"scaled-
{
dtype
}
-gemm"
,
...
...
@@ -241,7 +242,7 @@ def run(dtype: torch.dtype,
# output makers
def
make_output
(
data
:
Iterable
[
TMeasurement
],
MKNs
:
Iterable
[
T
uple
[
int
,
int
,
int
]],
MKNs
:
Iterable
[
t
uple
[
int
,
int
,
int
]],
base_description
:
str
,
timestamp
=
None
):
print
(
f
"== All Results
{
base_description
}
===="
)
...
...
@@ -282,7 +283,7 @@ def run_model_bench(args):
for
i
,
model
in
enumerate
(
args
.
models
):
print
(
f
"[
{
i
}
]
{
model
}
"
)
def
model_shapes
(
model_name
:
str
,
tp_size
:
int
)
->
L
ist
[
T
uple
[
int
,
int
]]:
def
model_shapes
(
model_name
:
str
,
tp_size
:
int
)
->
l
ist
[
t
uple
[
int
,
int
]]:
KNs
=
[]
for
KN
,
tp_split_dim
in
copy
.
deepcopy
(
WEIGHT_SHAPES
[
model_name
]):
KN
[
tp_split_dim
]
=
KN
[
tp_split_dim
]
//
tp_size
...
...
benchmarks/cutlass_benchmarks/utils.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# Cutlass bench utils
from
typing
import
Iterable
,
Tuple
from
collections.abc
import
Iterable
import
torch
...
...
@@ -27,7 +27,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
def
make_rand_tensors
(
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
T
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
k
:
int
)
->
t
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
*
5
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
*
5
...
...
@@ -63,7 +63,7 @@ def prune_to_2_4(tensor):
def
make_rand_sparse_tensors
(
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
T
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
k
:
int
)
->
t
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
*
5
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
*
5
...
...
@@ -88,7 +88,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
def
make_n_rand_sparse_tensors
(
num_tensors
:
int
,
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
\
T
uple
[
Iterable
[
torch
.
Tensor
],
Iterable
[
torch
.
Tensor
]]:
t
uple
[
Iterable
[
torch
.
Tensor
],
Iterable
[
torch
.
Tensor
]]:
ABs
=
[]
for
_
in
range
(
num_tensors
):
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
dtype
,
m
,
n
,
k
)
...
...
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
View file @
cf069aa8
...
...
@@ -5,7 +5,8 @@ import copy
import
itertools
import
pickle
as
pkl
import
time
from
typing
import
Callable
,
Iterable
,
List
,
Optional
,
Tuple
from
collections.abc
import
Iterable
from
typing
import
Callable
,
Optional
import
torch
import
torch.utils.benchmark
as
TBenchmark
...
...
@@ -49,7 +50,7 @@ def bench_int8(
n
:
int
,
label
:
str
,
sub_label
:
str
,
bench_kernels
:
Optional
[
L
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
bench_kernels
:
Optional
[
l
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
"""Benchmark INT8-based kernels."""
assert
dtype
==
torch
.
int8
a
,
b
=
make_rand_tensors
(
torch
.
int8
,
m
,
n
,
k
)
...
...
@@ -101,7 +102,7 @@ def bench_fp8(
n
:
int
,
label
:
str
,
sub_label
:
str
,
bench_kernels
:
Optional
[
L
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
bench_kernels
:
Optional
[
l
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
"""Benchmark FP8-based kernels."""
assert
dtype
==
torch
.
float8_e4m3fn
a
,
b
=
make_rand_tensors
(
torch
.
float8_e4m3fn
,
m
,
n
,
k
)
...
...
@@ -180,7 +181,7 @@ def bench(dtype: torch.dtype,
n
:
int
,
label
:
str
,
sub_label
:
str
,
bench_kernels
:
Optional
[
L
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
bench_kernels
:
Optional
[
l
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
if
dtype
==
torch
.
int8
:
return
bench_int8
(
dtype
,
m
,
k
,
n
,
label
,
sub_label
,
bench_kernels
)
if
dtype
==
torch
.
float8_e4m3fn
:
...
...
@@ -195,8 +196,8 @@ def print_timers(timers: Iterable[TMeasurement]):
def
run
(
dtype
:
torch
.
dtype
,
MKNs
:
Iterable
[
T
uple
[
int
,
int
,
int
]],
bench_kernels
:
Optional
[
L
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
MKNs
:
Iterable
[
t
uple
[
int
,
int
,
int
]],
bench_kernels
:
Optional
[
l
ist
[
str
]]
=
None
)
->
Iterable
[
TMeasurement
]:
results
=
[]
for
m
,
k
,
n
in
MKNs
:
timers
=
bench
(
dtype
,
...
...
@@ -212,7 +213,7 @@ def run(dtype: torch.dtype,
def
make_output
(
data
:
Iterable
[
TMeasurement
],
MKNs
:
Iterable
[
T
uple
[
int
,
int
,
int
]],
MKNs
:
Iterable
[
t
uple
[
int
,
int
,
int
]],
base_description
:
str
,
timestamp
=
None
):
print
(
f
"== All Results
{
base_description
}
===="
)
...
...
@@ -248,7 +249,7 @@ def run_model_bench(args):
for
i
,
model
in
enumerate
(
args
.
models
):
print
(
f
"[
{
i
}
]
{
model
}
"
)
def
model_shapes
(
model_name
:
str
,
tp_size
:
int
)
->
L
ist
[
T
uple
[
int
,
int
]]:
def
model_shapes
(
model_name
:
str
,
tp_size
:
int
)
->
l
ist
[
t
uple
[
int
,
int
]]:
KNs
=
[]
for
KN
,
tp_split_dim
in
copy
.
deepcopy
(
WEIGHT_SHAPES
[
model_name
]):
KN
[
tp_split_dim
]
=
KN
[
tp_split_dim
]
//
tp_size
...
...
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
View file @
cf069aa8
...
...
@@ -2,9 +2,10 @@
import
pickle
as
pkl
import
time
from
collections.abc
import
Iterable
from
dataclasses
import
dataclass
from
itertools
import
product
from
typing
import
Callable
,
Iterable
,
List
,
Optional
from
typing
import
Callable
,
Optional
import
torch
import
torch.utils.benchmark
as
TBenchmark
...
...
@@ -29,7 +30,7 @@ class bench_params_t:
f
'x DT
{
self
.
dtype
}
'
)
def
get_bench_params
()
->
L
ist
[
bench_params_t
]:
def
get_bench_params
()
->
l
ist
[
bench_params_t
]:
## Test Fixtures
NUM_TOKENS
=
[
2
**
x
for
x
in
range
(
11
)]
HIDDEN_SIZES
=
list
(
range
(
1024
,
8129
,
1024
))
...
...
benchmarks/kernels/benchmark_lora.py
View file @
cf069aa8
...
...
@@ -9,7 +9,7 @@ from dataclasses import dataclass
from
enum
import
Enum
,
auto
from
itertools
import
product
from
pathlib
import
Path
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
Callable
,
Optional
import
torch
import
torch.utils.benchmark
as
TBenchmark
...
...
@@ -61,15 +61,15 @@ def make_rand_lora_weight_tensor(k: int,
def
make_rand_tensors
(
a_shape
:
T
uple
[
int
],
b_shape
:
T
uple
[
int
],
c_shape
:
T
uple
[
int
],
a_shape
:
t
uple
[
int
],
b_shape
:
t
uple
[
int
],
c_shape
:
t
uple
[
int
],
a_dtype
:
torch
.
dtype
,
b_dtype
:
torch
.
dtype
,
c_dtype
:
torch
.
dtype
,
num_slices
:
int
,
device
:
str
=
"cuda"
,
)
->
T
uple
[
torch
.
Tensor
,
L
ist
[
torch
.
Tensor
],
torch
.
Tensor
]:
)
->
t
uple
[
torch
.
Tensor
,
l
ist
[
torch
.
Tensor
],
torch
.
Tensor
]:
"""
Make LoRA input/output matrices.
"""
...
...
@@ -135,7 +135,7 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int,
def
ref_group_gemm
(
ref_out
:
torch
.
Tensor
,
input
:
torch
.
Tensor
,
lora_weights
:
L
ist
[
torch
.
Tensor
],
lora_weights
:
l
ist
[
torch
.
Tensor
],
seq_lens_cpu
:
torch
.
Tensor
,
prompt_lora_mapping_cpu
:
torch
.
Tensor
,
scaling
:
float
,
add_inputs
:
Optional
[
bool
]):
...
...
@@ -204,7 +204,7 @@ class OpType(Enum):
def
is_expand_slice_fn
(
self
)
->
bool
:
return
self
in
[
OpType
.
BGMV_EXPAND_SLICE
]
def
num_slices
(
self
)
->
L
ist
[
int
]:
def
num_slices
(
self
)
->
l
ist
[
int
]:
if
self
in
[
OpType
.
SGMV_EXPAND
,
OpType
.
SGMV_SHRINK
]:
# SGMV kernels supports slices
return
[
1
,
2
,
3
]
...
...
@@ -215,7 +215,7 @@ class OpType(Enum):
raise
ValueError
(
f
"Unrecognized OpType
{
self
}
"
)
def
mkn
(
self
,
batch_size
:
int
,
seq_length
:
int
,
hidden_size
:
int
,
lora_rank
:
int
)
->
T
uple
[
int
,
int
,
int
]:
lora_rank
:
int
)
->
t
uple
[
int
,
int
,
int
]:
num_tokens
=
batch_size
*
seq_length
if
self
.
is_shrink_fn
():
m
=
num_tokens
...
...
@@ -230,7 +230,7 @@ class OpType(Enum):
def
matmul_dtypes
(
self
,
op_dtype
:
torch
.
dtype
)
->
T
uple
[
torch
.
dtype
,
torch
.
dtype
,
torch
.
dtype
]:
)
->
t
uple
[
torch
.
dtype
,
torch
.
dtype
,
torch
.
dtype
]:
"""
return a type, b type and c type for A x B = C
"""
...
...
@@ -243,7 +243,7 @@ class OpType(Enum):
def
matmul_shapes
(
self
,
batch_size
:
int
,
seq_length
:
int
,
hidden_size
:
int
,
lora_rank
:
int
,
num_loras
:
int
,
num_slices
:
int
)
->
T
uple
[
T
uple
[
int
],
T
uple
[
int
],
T
uple
[
int
]]:
num_slices
:
int
)
->
t
uple
[
t
uple
[
int
],
t
uple
[
int
],
t
uple
[
int
]]:
"""
Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type
...
...
@@ -268,7 +268,7 @@ class OpType(Enum):
def
bench_fn
(
self
)
->
Callable
:
def
emulate_bgmv_expand_slice
(
kwargs_list
:
L
ist
[
D
ict
[
str
,
Any
]]):
def
emulate_bgmv_expand_slice
(
kwargs_list
:
l
ist
[
d
ict
[
str
,
Any
]]):
for
x
in
kwargs_list
:
bgmv_expand_slice
(
**
x
)
...
...
@@ -285,7 +285,7 @@ class OpType(Enum):
raise
ValueError
(
f
"Unrecognized optype
{
self
}
"
)
def
run_ref_group_gemm
(
self
,
output
:
torch
.
Tensor
,
input
:
torch
.
Tensor
,
lora_weights
:
L
ist
[
torch
.
Tensor
],
lora_weights
:
l
ist
[
torch
.
Tensor
],
**
kwargs
)
->
Callable
:
"""Each benchmark operation expected the input, lora_weights and outputs
in a slightly different format. Refer to self.matmul_shapes().
...
...
@@ -384,7 +384,7 @@ class BenchmarkTensors:
"""
# matmul tensors
input
:
torch
.
Tensor
lora_weights_lst
:
L
ist
[
torch
.
Tensor
]
lora_weights_lst
:
l
ist
[
torch
.
Tensor
]
output
:
torch
.
Tensor
# metadata tensors
seq_lens
:
torch
.
Tensor
...
...
@@ -469,7 +469,7 @@ class BenchmarkTensors:
for
i
in
range
(
len
(
self
.
lora_weights_lst
)):
self
.
lora_weights_lst
[
i
]
=
to_device
(
self
.
lora_weights_lst
[
i
])
def
metadata
(
self
)
->
T
uple
[
int
,
int
,
int
]:
def
metadata
(
self
)
->
t
uple
[
int
,
int
,
int
]:
"""
Return num_seqs, num_tokens and max_seq_len
"""
...
...
@@ -505,7 +505,7 @@ class BenchmarkTensors:
self
.
seq_lens
=
seq_lens
.
to
(
dtype
=
self
.
seq_lens
.
dtype
)
self
.
seq_start_loc
=
seq_start_loc
.
to
(
dtype
=
self
.
seq_start_loc
.
dtype
)
def
as_sgmv_shrink_kwargs
(
self
)
->
D
ict
[
str
,
Any
]:
def
as_sgmv_shrink_kwargs
(
self
)
->
d
ict
[
str
,
Any
]:
self
.
convert_to_sgmv_benchmark_tensors
()
self
.
sanity_check
()
self
.
to_device
(
self
.
input
.
device
)
...
...
@@ -540,7 +540,7 @@ class BenchmarkTensors:
'scaling'
:
1.0
,
}
def
as_sgmv_expand_kwargs
(
self
,
add_inputs
:
bool
)
->
D
ict
[
str
,
Any
]:
def
as_sgmv_expand_kwargs
(
self
,
add_inputs
:
bool
)
->
d
ict
[
str
,
Any
]:
self
.
convert_to_sgmv_benchmark_tensors
()
self
.
sanity_check
()
...
...
@@ -578,7 +578,7 @@ class BenchmarkTensors:
'add_inputs'
:
add_inputs
,
}
def
as_bgmv_shrink_kwargs
(
self
)
->
D
ict
[
str
,
Any
]:
def
as_bgmv_shrink_kwargs
(
self
)
->
d
ict
[
str
,
Any
]:
assert
len
(
self
.
lora_weights_lst
)
==
1
self
.
to_device
(
self
.
input
.
device
)
...
...
@@ -634,7 +634,7 @@ class BenchmarkTensors:
'add_inputs'
:
add_inputs
}
def
as_bgmv_expand_slice_kwargs
(
self
,
add_inputs
:
bool
)
->
D
ict
[
str
,
Any
]:
def
as_bgmv_expand_slice_kwargs
(
self
,
add_inputs
:
bool
)
->
d
ict
[
str
,
Any
]:
_
,
num_tokens
,
_
,
num_slices
=
self
.
metadata
()
# Sanity check shapes
...
...
@@ -670,7 +670,7 @@ class BenchmarkTensors:
def
bench_fn_kwargs
(
self
,
op_type
:
OpType
,
add_inputs
:
Optional
[
bool
]
=
None
)
->
D
ict
[
str
,
Any
]:
add_inputs
:
Optional
[
bool
]
=
None
)
->
d
ict
[
str
,
Any
]:
if
op_type
.
is_shrink_fn
():
assert
add_inputs
is
None
else
:
...
...
@@ -734,7 +734,7 @@ def bench_optype(ctx: BenchmarkContext,
assert
expand_fn_add_inputs
is
not
None
# BenchmarkContext -> BenchmarkTensors
bench_tensors
:
L
ist
[
BenchmarkTensors
]
=
\
bench_tensors
:
l
ist
[
BenchmarkTensors
]
=
\
[
BenchmarkTensors
.
make
(
ctx
,
op_type
)
for
_
in
range
(
arg_pool_size
)]
for
bt
in
bench_tensors
:
bt
.
sanity_check
()
...
...
@@ -746,7 +746,7 @@ def bench_optype(ctx: BenchmarkContext,
for
bt
in
bench_tensors
])
# BenchmarkTensors ->
D
ict (kwargs)
# BenchmarkTensors ->
d
ict (kwargs)
kwargs_list
=
[
bt
.
bench_fn_kwargs
(
op_type
,
add_inputs
=
expand_fn_add_inputs
)
for
bt
in
bench_tensors
...
...
@@ -841,7 +841,7 @@ def use_cuda_graph_recommendation() -> str:
"""
def
print_timers
(
timers
:
L
ist
[
TMeasurement
],
def
print_timers
(
timers
:
l
ist
[
TMeasurement
],
args
:
Optional
[
argparse
.
Namespace
]
=
None
):
compare
=
TBenchmark
.
Compare
(
timers
)
compare
.
print
()
...
...
@@ -861,7 +861,7 @@ def print_timers(timers: List[TMeasurement],
"small num_loras the goal should be to match the torch.mm numbers."
)
def
run
(
args
:
argparse
.
Namespace
,
bench_ctxs
:
L
ist
[
BenchmarkContext
]):
def
run
(
args
:
argparse
.
Namespace
,
bench_ctxs
:
l
ist
[
BenchmarkContext
]):
if
args
.
cuda_graph_nops
is
not
None
:
assert
args
.
cuda_graph_nops
>
0
...
...
@@ -873,7 +873,7 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
timers
=
[]
for
bench_ctx
in
bench_ctxs
:
for
seq_len
in
args
.
seq_lengths
:
bench_ops
:
L
ist
[
OpType
]
=
[]
bench_ops
:
l
ist
[
OpType
]
=
[]
if
seq_len
==
1
:
# bench all decode ops
bench_ops
=
[
op
for
op
in
args
.
op_types
if
op
.
is_decode_op
()]
...
...
@@ -921,10 +921,10 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
pickle
.
dump
(
timers
,
f
)
def
as_benchmark_contexts
(
hidden_sizes
:
L
ist
[
int
],
lora_ranks
:
L
ist
[
int
],
args
:
argparse
.
Namespace
)
->
L
ist
[
BenchmarkContext
]:
def
as_benchmark_contexts
(
hidden_sizes
:
l
ist
[
int
],
lora_ranks
:
l
ist
[
int
],
args
:
argparse
.
Namespace
)
->
l
ist
[
BenchmarkContext
]:
ctxs
:
L
ist
[
BenchmarkContext
]
=
[]
ctxs
:
l
ist
[
BenchmarkContext
]
=
[]
for
batch_size
,
hidden_size
,
lora_rank
,
num_loras
,
sort_by_lora_id
in
product
(
# noqa
args
.
batch_sizes
,
list
(
hidden_sizes
),
lora_ranks
,
args
.
num_loras
,
args
.
sort_by_lora_id
):
...
...
@@ -954,7 +954,7 @@ def run_list_bench(args: argparse.Namespace):
f
" LoRA Ranks
{
args
.
lora_ranks
}
"
)
# Get all benchmarking contexts
bench_contexts
:
L
ist
[
BenchmarkContext
]
=
as_benchmark_contexts
(
bench_contexts
:
l
ist
[
BenchmarkContext
]
=
as_benchmark_contexts
(
hidden_sizes
=
args
.
hidden_sizes
,
lora_ranks
=
args
.
lora_ranks
,
args
=
args
)
run
(
args
,
bench_contexts
)
...
...
@@ -975,7 +975,7 @@ def run_range_bench(args: argparse.Namespace):
f
" LoRA Ranks
{
lora_ranks
}
"
)
# Get all benchmarking contexts
bench_contexts
:
L
ist
[
BenchmarkContext
]
=
as_benchmark_contexts
(
bench_contexts
:
l
ist
[
BenchmarkContext
]
=
as_benchmark_contexts
(
hidden_sizes
=
hidden_sizes
,
lora_ranks
=
lora_ranks
,
args
=
args
)
run
(
args
,
bench_contexts
)
...
...
@@ -1002,7 +1002,7 @@ def run_model_bench(args: argparse.Namespace):
f
" LoRA Ranks
{
args
.
lora_ranks
}
"
)
# Get all benchmarking contexts
bench_contexts
:
L
ist
[
BenchmarkContext
]
=
as_benchmark_contexts
(
bench_contexts
:
l
ist
[
BenchmarkContext
]
=
as_benchmark_contexts
(
hidden_sizes
=
hidden_sizes
,
lora_ranks
=
args
.
lora_ranks
,
args
=
args
)
run
(
args
,
bench_contexts
)
...
...
benchmarks/kernels/benchmark_machete.py
View file @
cf069aa8
...
...
@@ -7,9 +7,10 @@ import math
import
os
import
pickle
as
pkl
import
time
from
collections.abc
import
Iterable
from
dataclasses
import
dataclass
from
itertools
import
product
from
typing
import
Callable
,
Iterable
,
List
,
Optional
,
Tuple
from
typing
import
Callable
,
Optional
import
pandas
as
pd
import
torch
...
...
@@ -102,8 +103,8 @@ def quantize_and_pack(atype: torch.dtype,
return
w_ref
,
w_q
,
w_s
,
w_zp
def
create_bench_tensors
(
shape
:
T
uple
[
int
,
int
,
int
],
types
:
TypeConfig
,
group_size
:
Optional
[
int
])
->
L
ist
[
BenchmarkTensors
]:
def
create_bench_tensors
(
shape
:
t
uple
[
int
,
int
,
int
],
types
:
TypeConfig
,
group_size
:
Optional
[
int
])
->
l
ist
[
BenchmarkTensors
]:
m
,
n
,
k
=
shape
# we want to make sure that weights don't fit into L2 cache between runs so
...
...
@@ -114,7 +115,7 @@ def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
a
=
rand_data
((
m
,
k
),
types
.
act_type
,
scale
=
5
)
benchmark_tensors
:
L
ist
[
BenchmarkTensors
]
=
[]
benchmark_tensors
:
l
ist
[
BenchmarkTensors
]
=
[]
for
_
in
range
(
num_weights
):
w
=
rand_data
((
k
,
n
),
types
.
act_type
,
scale
=
5
)
...
...
@@ -276,7 +277,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
def
bench_fns
(
label
:
str
,
sub_label
:
str
,
description
:
str
,
fns
:
L
ist
[
Callable
]):
fns
:
l
ist
[
Callable
]):
min_run_time
=
1
if
not
NVTX_PROFILE
else
0.1
res
=
TBenchmark
.
Timer
(
...
...
@@ -311,7 +312,7 @@ def bench(types: TypeConfig,
n
:
int
,
label
:
str
,
sub_label
:
str
,
sweep_schedules
:
bool
=
True
)
->
L
ist
[
TMeasurement
]:
sweep_schedules
:
bool
=
True
)
->
l
ist
[
TMeasurement
]:
benchmark_tensors
=
create_bench_tensors
((
m
,
n
,
k
),
types
,
group_size
)
sub_label
+=
f
", L=
{
len
(
benchmark_tensors
)
}
"
...
...
@@ -414,12 +415,12 @@ def bench(types: TypeConfig,
# runner
def
print_timers
(
timers
:
L
ist
[
TMeasurement
]):
def
print_timers
(
timers
:
l
ist
[
TMeasurement
]):
compare
=
TBenchmark
.
Compare
(
timers
)
compare
.
print
()
def
run
(
args
,
MKNs
:
Iterable
[
T
uple
[
int
,
int
,
int
]])
->
Iterable
[
TMeasurement
]:
def
run
(
args
,
MKNs
:
Iterable
[
t
uple
[
int
,
int
,
int
]])
->
Iterable
[
TMeasurement
]:
types
=
TypeConfig
(
act_type
=
args
.
act_type
,
weight_type
=
scalar_types
.
uint4b8
if
args
.
group_zero_type
is
None
\
...
...
@@ -431,7 +432,7 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
token_scale_type
=
args
.
token_scale_type
,
)
results
:
L
ist
[
TMeasurement
]
=
[]
results
:
l
ist
[
TMeasurement
]
=
[]
for
m
,
k
,
n
in
MKNs
:
timers
=
bench
(
types
,
args
.
group_size
,
...
...
@@ -449,8 +450,8 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
# output makers
def
make_output
(
data
:
L
ist
[
TMeasurement
],
MKNs
:
Iterable
[
T
uple
[
int
,
int
,
int
]],
data
:
l
ist
[
TMeasurement
],
MKNs
:
Iterable
[
t
uple
[
int
,
int
,
int
]],
base_description
:
str
,
timestamp
=
None
,
):
...
...
@@ -497,7 +498,7 @@ def run_model_bench(args):
for
i
,
model
in
enumerate
(
args
.
models
):
print
(
f
"[
{
i
}
]
{
model
}
"
)
def
model_shapes
(
model_name
:
str
,
tp_size
:
int
)
->
L
ist
[
T
uple
[
int
,
int
]]:
def
model_shapes
(
model_name
:
str
,
tp_size
:
int
)
->
l
ist
[
t
uple
[
int
,
int
]]:
KNs
=
[]
for
KN
,
tp_split_dim
in
copy
.
deepcopy
(
WEIGHT_SHAPES
[
model_name
]):
KN
[
tp_split_dim
]
=
KN
[
tp_split_dim
]
//
tp_size
...
...
benchmarks/kernels/benchmark_marlin.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
torch
import
torch.utils.benchmark
as
benchmark
from
benchmark_shapes
import
WEIGHT_SHAPES
...
...
@@ -31,7 +29,7 @@ ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS
=
[
False
,
True
]
def
bench_run
(
results
:
L
ist
[
benchmark
.
Measurement
],
model
:
str
,
def
bench_run
(
results
:
l
ist
[
benchmark
.
Measurement
],
model
:
str
,
act_order
:
bool
,
is_k_full
:
bool
,
quant_type
:
ScalarType
,
group_size
:
int
,
size_m
:
int
,
size_k
:
int
,
size_n
:
int
):
label
=
"Quant Matmul"
...
...
@@ -221,7 +219,7 @@ def main(args):
for
i
,
model
in
enumerate
(
args
.
models
):
print
(
f
"[
{
i
}
]
{
model
}
"
)
results
:
L
ist
[
benchmark
.
Measurement
]
=
[]
results
:
l
ist
[
benchmark
.
Measurement
]
=
[]
for
model
in
args
.
models
:
for
layer
in
WEIGHT_SHAPES
[
model
]:
...
...
benchmarks/kernels/benchmark_moe.py
View file @
cf069aa8
...
...
@@ -4,7 +4,7 @@ import argparse
import
time
from
datetime
import
datetime
from
itertools
import
product
from
typing
import
Any
,
Dict
,
List
,
Tuple
,
TypedDict
from
typing
import
Any
,
TypedDict
import
ray
import
torch
...
...
@@ -132,7 +132,7 @@ def benchmark_config(
start_event
=
torch
.
cuda
.
Event
(
enable_timing
=
True
)
end_event
=
torch
.
cuda
.
Event
(
enable_timing
=
True
)
latencies
:
L
ist
[
float
]
=
[]
latencies
:
l
ist
[
float
]
=
[]
for
i
in
range
(
num_iters
):
prepare
(
i
)
torch
.
cuda
.
synchronize
()
...
...
@@ -175,8 +175,8 @@ def get_rocm_tuning_space(use_fp16):
return
param_ranges
def
get_configs_compute_bound
(
use_fp16
)
->
L
ist
[
D
ict
[
str
,
int
]]:
configs
:
L
ist
[
BenchmarkConfig
]
=
[]
def
get_configs_compute_bound
(
use_fp16
)
->
l
ist
[
d
ict
[
str
,
int
]]:
configs
:
l
ist
[
BenchmarkConfig
]
=
[]
if
current_platform
.
is_rocm
():
param_ranges
=
get_rocm_tuning_space
(
use_fp16
)
...
...
@@ -335,7 +335,7 @@ class BenchmarkWorker:
dtype
:
torch
.
dtype
,
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
)
->
T
uple
[
D
ict
[
str
,
int
],
float
]:
)
->
t
uple
[
d
ict
[
str
,
int
],
float
]:
current_platform
.
seed_everything
(
self
.
seed
)
dtype_str
=
get_config_dtype_str
(
dtype
,
use_int8_w8a16
=
use_int8_w8a16
,
...
...
@@ -371,8 +371,8 @@ class BenchmarkWorker:
dtype
:
torch
.
dtype
,
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
search_space
:
L
ist
[
D
ict
[
str
,
int
]],
)
->
D
ict
[
str
,
int
]:
search_space
:
l
ist
[
d
ict
[
str
,
int
]],
)
->
d
ict
[
str
,
int
]:
best_config
=
None
best_time
=
float
(
"inf"
)
if
current_platform
.
is_rocm
():
...
...
@@ -434,7 +434,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
}
def
save_configs
(
configs
:
D
ict
[
int
,
BenchmarkConfig
],
num_experts
:
int
,
def
save_configs
(
configs
:
d
ict
[
int
,
BenchmarkConfig
],
num_experts
:
int
,
shard_intermediate_size
:
int
,
hidden_size
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
,
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
)
->
None
:
...
...
@@ -498,7 +498,7 @@ def main(args: argparse.Namespace):
num_gpus
=
int
(
ray
.
available_resources
()[
"GPU"
])
workers
=
[
BenchmarkWorker
.
remote
(
args
.
seed
)
for
_
in
range
(
num_gpus
)]
def
_distribute
(
method
:
str
,
inputs
:
L
ist
[
Any
])
->
L
ist
[
Any
]:
def
_distribute
(
method
:
str
,
inputs
:
l
ist
[
Any
])
->
l
ist
[
Any
]:
outputs
=
[]
worker_idx
=
0
for
input_args
in
inputs
:
...
...
benchmarks/kernels/benchmark_paged_attention.py
View file @
cf069aa8
...
...
@@ -2,7 +2,7 @@
import
random
import
time
from
typing
import
List
,
Optional
from
typing
import
Optional
import
torch
...
...
@@ -54,7 +54,7 @@ def main(
# Create the block tables.
max_num_blocks_per_seq
=
(
max_seq_len
+
block_size
-
1
)
//
block_size
block_tables_lst
:
L
ist
[
L
ist
[
int
]]
=
[]
block_tables_lst
:
l
ist
[
l
ist
[
int
]]
=
[]
for
_
in
range
(
num_seqs
):
block_table
=
[
random
.
randint
(
0
,
NUM_BLOCKS
-
1
)
...
...
benchmarks/kernels/benchmark_rmsnorm.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
itertools
from
typing
import
Optional
,
Tuple
,
Union
from
typing
import
Optional
,
Union
import
torch
import
triton
...
...
@@ -22,7 +22,7 @@ class HuggingFaceRMSNorm(nn.Module):
self
,
x
:
torch
.
Tensor
,
residual
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Union
[
torch
.
Tensor
,
T
uple
[
torch
.
Tensor
,
torch
.
Tensor
]]:
)
->
Union
[
torch
.
Tensor
,
t
uple
[
torch
.
Tensor
,
torch
.
Tensor
]]:
orig_dtype
=
x
.
dtype
x
=
x
.
to
(
torch
.
float32
)
if
residual
is
not
None
:
...
...
benchmarks/kernels/benchmark_rope.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
itertools
import
accumulate
from
typing
import
List
,
Optional
from
typing
import
Optional
import
nvtx
import
torch
...
...
@@ -39,7 +39,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes
:
L
ist
[
RotaryEmbedding
]
=
[]
non_batched_ropes
:
l
ist
[
RotaryEmbedding
]
=
[]
for
scaling_factor
in
scaling_factors
:
non_batched_ropes
.
append
(
get_rope
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
,
...
...
Prev
1
2
3
4
5
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment