Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
539aa992
Commit
539aa992
authored
Sep 27, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.2' into v0.6.2-dev
parents
93872128
7193774b
Changes
383
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
432 additions
and
208 deletions
+432
-208
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+177
-72
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+21
-6
benchmarks/kernels/benchmark_layernorm.py
benchmarks/kernels/benchmark_layernorm.py
+3
-6
benchmarks/kernels/benchmark_machete.py
benchmarks/kernels/benchmark_machete.py
+61
-13
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+3
-3
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_paged_attention.py
+2
-6
benchmarks/kernels/benchmark_quant.py
benchmarks/kernels/benchmark_quant.py
+3
-6
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_rope.py
+2
-4
benchmarks/kernels/graph_machete_bench.py
benchmarks/kernels/graph_machete_bench.py
+1
-3
benchmarks/kernels/requirements.txt
benchmarks/kernels/requirements.txt
+1
-0
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+0
-1
cmake/utils.cmake
cmake/utils.cmake
+5
-4
collect_env.py
collect_env.py
+7
-2
csrc/cpu/quant.cpp
csrc/cpu/quant.cpp
+6
-3
csrc/cpu/torch_bindings.cpp
csrc/cpu/torch_bindings.cpp
+5
-4
csrc/custom_all_reduce.cu
csrc/custom_all_reduce.cu
+0
-12
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce.cuh
+85
-54
csrc/custom_all_reduce_test.cu
csrc/custom_all_reduce_test.cu
+16
-5
csrc/cutlass_extensions/torch_utils.hpp
csrc/cutlass_extensions/torch_utils.hpp
+7
-1
csrc/mamba/causal_conv1d/causal_conv1d.cu
csrc/mamba/causal_conv1d/causal_conv1d.cu
+27
-3
No files found.
benchmarks/benchmark_serving.py
View file @
539aa992
...
...
@@ -24,6 +24,8 @@ On the client side, run:
"""
import
argparse
import
asyncio
import
base64
import
io
import
json
import
os
import
random
...
...
@@ -31,11 +33,13 @@ import time
import
warnings
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
AsyncGenerator
,
Collection
,
Dict
,
List
,
Optional
,
Tuple
import
numpy
as
np
from
backend_request_func
import
(
ASYNC_REQUEST_FUNCS
,
RequestFuncInput
,
RequestFuncOutput
)
from
datasets
import
load_dataset
from
PIL.Image
import
Image
from
tqdm.asyncio
import
tqdm
from
transformers
import
PreTrainedTokenizerBase
...
...
@@ -84,7 +88,7 @@ def sample_sharegpt_requests(
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
List
[
Tuple
[
str
,
int
,
int
]]:
)
->
List
[
Tuple
[
str
,
int
,
int
,
None
]]:
if
fixed_output_len
is
not
None
and
fixed_output_len
<
4
:
raise
ValueError
(
"output_len too small"
)
# Load the dataset.
...
...
@@ -119,7 +123,7 @@ def sample_sharegpt_requests(
if
prompt_len
>
1024
or
prompt_len
+
output_len
>
2048
:
# Prune too long sequences.
continue
filtered_dataset
.
append
((
prompt
,
prompt_len
,
output_len
))
filtered_dataset
.
append
((
prompt
,
prompt_len
,
output_len
,
None
))
return
filtered_dataset
...
...
@@ -131,7 +135,7 @@ def sample_sonnet_requests(
output_len
:
int
,
prefix_len
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
List
[
Tuple
[
str
,
str
,
int
,
int
]]:
)
->
List
[
Tuple
[
str
,
str
,
int
,
int
,
None
]]:
assert
(
input_len
>
prefix_len
),
"'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
...
...
@@ -189,7 +193,65 @@ def sample_sonnet_requests(
message
,
add_generation_prompt
=
True
,
tokenize
=
False
)
prompt_len
=
len
(
tokenizer
(
prompt_formatted
).
input_ids
)
sampled_requests
.
append
(
(
prompt
,
prompt_formatted
,
prompt_len
,
output_len
))
(
prompt
,
prompt_formatted
,
prompt_len
,
output_len
,
None
))
return
sampled_requests
def sample_hf_requests(
    dataset_path: str,
    dataset_subset: str,
    dataset_split: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int, Optional[Dict[str, Collection[str]]]]]:
    """Sample benchmark requests from a streamed Hugging Face dataset.

    The dataset is opened with ``streaming=True``, shuffled, and filtered to
    rows whose ``"conversations"`` column has at least two turns. The first
    turn's ``"value"`` is used as the prompt and the second turn's ``"value"``
    as the reference completion.

    Args:
        dataset_path: Dataset identifier/path passed to ``load_dataset``.
        dataset_subset: Passed to ``load_dataset`` as ``name``.
        dataset_split: Passed to ``load_dataset`` as ``split``.
        num_requests: Maximum number of requests to return; iteration stops
            as soon as this many have been collected.
        tokenizer: Tokenizer used to measure prompt/completion lengths.
        fixed_output_len: If not ``None``, used as the output length of every
            request instead of the tokenized completion length.

    Returns:
        A list of ``(prompt, prompt_len, output_len, mm_content)`` tuples.
        ``mm_content`` is an ``image_url`` payload holding a base64 JPEG data
        URL when the row has an ``"image"`` entry that is a PIL image, and
        ``None`` otherwise. The list may hold fewer than ``num_requests``
        items if the filtered dataset is exhausted first.

    Raises:
        AssertionError: If the dataset has no ``"conversations"`` column.
    """
    dataset = load_dataset(dataset_path,
                           name=dataset_subset,
                           split=dataset_split,
                           streaming=True)
    assert "conversations" in dataset.features, (
        "HF Dataset must have 'conversations' column.")
    filtered_dataset = dataset.shuffle().filter(
        lambda x: len(x["conversations"]) >= 2)

    # mm_content is None for text-only rows, hence Optional in the item type.
    sampled_requests: List[Tuple[str, int, int,
                                 Optional[Dict[str, Collection[str]]]]] = []
    for data in filtered_dataset:
        if len(sampled_requests) == num_requests:
            break

        # Tokenize the prompts and completions.
        prompt = data["conversations"][0]["value"]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = data["conversations"][1]["value"]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = (len(completion_token_ids)
                      if fixed_output_len is None else fixed_output_len)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue

        if "image" in data and isinstance(data["image"], Image):
            # Re-encode the image as an in-memory JPEG and embed it as a
            # base64 data URL.
            image: Image = data["image"]
            image = image.convert("RGB")
            image_data = io.BytesIO()
            image.save(image_data, format='JPEG')
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
            mm_content = {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_base64}"
                },
            }
        else:
            mm_content = None

        sampled_requests.append((prompt, prompt_len, output_len, mm_content))

    return sampled_requests
...
...
@@ -223,8 +285,8 @@ def sample_random_requests(
[(
offsets
[
i
]
+
i
+
j
)
%
tokenizer
.
vocab_size
for
j
in
range
(
input_lens
[
i
])])
input_requests
.
append
(
(
prompt
,
int
(
prefix_len
+
input_lens
[
i
]),
int
(
output_lens
[
i
])))
input_requests
.
append
(
(
prompt
,
int
(
prefix_len
+
input_lens
[
i
]),
int
(
output_lens
[
i
])
,
None
))
return
input_requests
...
...
@@ -343,7 +405,12 @@ async def benchmark(
raise
ValueError
(
f
"Unknown backend:
{
backend
}
"
)
print
(
"Starting initial single prompt test run..."
)
test_prompt
,
test_prompt_len
,
test_output_len
=
input_requests
[
0
]
test_prompt
,
test_prompt_len
,
test_output_len
,
test_mm_content
=
(
input_requests
[
0
])
if
backend
!=
"openai-chat"
and
test_mm_content
is
not
None
:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise
ValueError
(
"Multi-modal content is only supported on 'openai-chat' backend."
)
test_input
=
RequestFuncInput
(
model
=
model_id
,
prompt
=
test_prompt
,
...
...
@@ -353,6 +420,7 @@ async def benchmark(
logprobs
=
logprobs
,
best_of
=
best_of
,
use_beam_search
=
use_beam_search
,
multi_modal_content
=
test_mm_content
,
)
test_output
=
await
request_func
(
request_func_input
=
test_input
)
if
not
test_output
.
success
:
...
...
@@ -373,6 +441,7 @@ async def benchmark(
logprobs
=
logprobs
,
best_of
=
best_of
,
use_beam_search
=
use_beam_search
,
multi_modal_content
=
test_mm_content
,
)
profile_output
=
await
request_func
(
request_func_input
=
profile_input
)
if
profile_output
.
success
:
...
...
@@ -385,7 +454,7 @@ async def benchmark(
benchmark_start_time
=
time
.
perf_counter
()
tasks
:
List
[
asyncio
.
Task
]
=
[]
async
for
request
in
get_request
(
input_requests
,
request_rate
):
prompt
,
prompt_len
,
output_len
=
request
prompt
,
prompt_len
,
output_len
,
mm_content
=
request
request_func_input
=
RequestFuncInput
(
model
=
model_id
,
prompt
=
prompt
,
...
...
@@ -395,6 +464,7 @@ async def benchmark(
logprobs
=
logprobs
,
best_of
=
best_of
,
use_beam_search
=
use_beam_search
,
multi_modal_content
=
mm_content
,
)
tasks
.
append
(
asyncio
.
create_task
(
...
...
@@ -556,9 +626,9 @@ def main(args: argparse.Namespace):
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
)
input_requests
=
[(
prompt
,
prompt_len
,
output_len
)
input_requests
=
[(
prompt
,
prompt_len
,
output_len
,
None
)
for
prompt
,
prompt_formatted
,
prompt_len
,
output_len
in
input_requests
]
output_len
,
_
in
input_requests
]
else
:
assert
(
tokenizer
.
chat_template
or
tokenizer
.
default_chat_template
...
...
@@ -571,9 +641,19 @@ def main(args: argparse.Namespace):
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
)
input_requests
=
[(
prompt_formatted
,
prompt_len
,
output_len
)
input_requests
=
[(
prompt_formatted
,
prompt_len
,
output_len
,
None
)
for
prompt
,
prompt_formatted
,
prompt_len
,
output_len
in
input_requests
]
output_len
,
_
in
input_requests
]
elif
args
.
dataset_name
==
"hf"
:
input_requests
=
sample_hf_requests
(
dataset_path
=
args
.
dataset_path
,
dataset_subset
=
args
.
hf_subset
,
dataset_split
=
args
.
hf_split
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
fixed_output_len
=
args
.
hf_output_len
,
)
elif
args
.
dataset_name
==
"random"
:
input_requests
=
sample_random_requests
(
...
...
@@ -685,13 +765,14 @@ if __name__ == "__main__":
"--dataset-name"
,
type
=
str
,
default
=
"sharegpt"
,
choices
=
[
"sharegpt"
,
"sonnet"
,
"random"
],
choices
=
[
"sharegpt"
,
"sonnet"
,
"random"
,
"hf"
],
help
=
"Name of the dataset to benchmark on."
,
)
parser
.
add_argument
(
"--dataset-path"
,
type
=
str
,
default
=
None
,
help
=
"Path to the dataset."
)
help
=
"Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset."
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
...
...
@@ -718,26 +799,6 @@ if __name__ == "__main__":
default
=
1000
,
help
=
"Number of prompts to process."
,
)
parser
.
add_argument
(
"--sharegpt-output-len"
,
type
=
int
,
default
=
None
,
help
=
"Output length for each request. Overrides the output length "
"from the ShareGPT dataset."
)
parser
.
add_argument
(
"--sonnet-input-len"
,
type
=
int
,
default
=
550
,
help
=
"Number of input tokens per request, used only for sonnet dataset."
,
)
parser
.
add_argument
(
"--sonnet-output-len"
,
type
=
int
,
default
=
150
,
help
=
"Number of output tokens per request, used only for sonnet dataset."
,
)
parser
.
add_argument
(
"--logprobs"
,
type
=
int
,
...
...
@@ -748,42 +809,6 @@ if __name__ == "__main__":
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"
),
)
parser
.
add_argument
(
"--sonnet-prefix-len"
,
type
=
int
,
default
=
200
,
help
=
"Number of prefix tokens per request, used only for sonnet dataset."
,
)
parser
.
add_argument
(
"--random-input-len"
,
type
=
int
,
default
=
1024
,
help
=
"Number of input tokens per request, used only for random sampling."
,
)
parser
.
add_argument
(
"--random-output-len"
,
type
=
int
,
default
=
128
,
help
=
"Number of output tokens per request, used only for random sampling."
,
)
parser
.
add_argument
(
"--random-range-ratio"
,
type
=
float
,
default
=
1.0
,
help
=
"Range of sampled ratio of input/output length, "
"used only for random sampling."
,
)
parser
.
add_argument
(
"--random-prefix-len"
,
type
=
int
,
default
=
0
,
help
=
"Number of fixed prefix tokens before random "
" context. The length range of context in a random "
" request is [random-prefix-len, "
" random-prefix-len + random-prefix-len * random-range-ratio)."
)
parser
.
add_argument
(
"--request-rate"
,
type
=
float
,
...
...
@@ -857,5 +882,85 @@ if __name__ == "__main__":
"Use
\"
--percentile-metrics
\"
to select metrics."
,
)
# group for dataset specific arguments
sonnet_group
=
parser
.
add_argument_group
(
"sonnet dataset options"
)
sonnet_group
.
add_argument
(
"--sonnet-input-len"
,
type
=
int
,
default
=
550
,
help
=
"Number of input tokens per request, used only for sonnet dataset."
,
)
sonnet_group
.
add_argument
(
"--sonnet-output-len"
,
type
=
int
,
default
=
150
,
help
=
"Number of output tokens per request, used only for sonnet dataset."
,
)
sonnet_group
.
add_argument
(
"--sonnet-prefix-len"
,
type
=
int
,
default
=
200
,
help
=
"Number of prefix tokens per request, used only for sonnet dataset."
,
)
sharegpt_group
=
parser
.
add_argument_group
(
"sharegpt dataset options"
)
sharegpt_group
.
add_argument
(
"--sharegpt-output-len"
,
type
=
int
,
default
=
None
,
help
=
"Output length for each request. Overrides the output length "
"from the ShareGPT dataset."
)
random_group
=
parser
.
add_argument_group
(
"random dataset options"
)
random_group
.
add_argument
(
"--random-input-len"
,
type
=
int
,
default
=
1024
,
help
=
"Number of input tokens per request, used only for random sampling."
,
)
random_group
.
add_argument
(
"--random-output-len"
,
type
=
int
,
default
=
128
,
help
=
"Number of output tokens per request, used only for random sampling."
,
)
random_group
.
add_argument
(
"--random-range-ratio"
,
type
=
float
,
default
=
1.0
,
help
=
"Range of sampled ratio of input/output length, "
"used only for random sampling."
,
)
random_group
.
add_argument
(
"--random-prefix-len"
,
type
=
int
,
default
=
0
,
help
=
"Number of fixed prefix tokens before random "
" context. The length range of context in a random "
" request is [random-prefix-len, "
" random-prefix-len + random-prefix-len * random-range-ratio)."
)
hf_group
=
parser
.
add_argument_group
(
"hf dataset options"
)
hf_group
.
add_argument
(
"--hf-subset"
,
type
=
str
,
default
=
None
,
help
=
"Subset of the HF dataset."
)
hf_group
.
add_argument
(
"--hf-split"
,
type
=
str
,
default
=
None
,
help
=
"Split of the HF dataset."
)
hf_group
.
add_argument
(
"--hf-output-len"
,
type
=
int
,
default
=
None
,
help
=
"Output length for each request. Overrides the output lengths "
"from the sampled HF dataset."
,
)
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
\ No newline at end of file
benchmarks/benchmark_throughput.py
View file @
539aa992
...
...
@@ -93,6 +93,7 @@ def run_vllm(
download_dir
:
Optional
[
str
]
=
None
,
load_format
:
str
=
EngineArgs
.
load_format
,
disable_async_output_proc
:
bool
=
False
,
use_new_beam_search_impl
:
bool
=
False
,
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
...
...
@@ -169,10 +170,24 @@ def run_vllm(
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion()
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
if
not
use_new_beam_search_impl
:
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
else
:
assert
use_beam_search
prompts
=
[
prompt
for
prompt
,
_
,
_
in
requests
]
# output_len should be the same for all requests.
output_len
=
requests
[
0
][
2
]
for
prompt
,
input_len
,
_output_len
in
requests
:
assert
_output_len
==
output_len
start
=
time
.
perf_counter
()
llm
.
beam_search
(
prompts
,
beam_width
=
n
,
max_tokens
=
output_len
,
ignore_eos
=
True
)
end
=
time
.
perf_counter
()
return
end
-
start
...
...
@@ -229,7 +244,6 @@ async def run_vllm_async(
use_v2_block_manager
=
use_v2_block_manager
,
disable_async_output_proc
=
disable_async_output_proc
,
worker_use_ray
=
False
,
engine_use_ray
=
False
,
disable_log_requests
=
True
,
)
...
...
@@ -378,7 +392,7 @@ def main(args: argparse.Namespace):
run_args
.
append
(
args
.
disable_frontend_multiprocessing
)
elapsed_time
=
uvloop
.
run
(
run_vllm_async
(
*
run_args
))
else
:
elapsed_time
=
run_vllm
(
*
run_args
)
elapsed_time
=
run_vllm
(
*
run_args
,
args
.
use_new_beam_search_impl
)
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
...
...
@@ -450,6 +464,7 @@ if __name__ == "__main__":
type
=
int
,
default
=
1
,
help
=
'Number of iterations to run for warmup.'
)
parser
.
add_argument
(
"--use-new-beam-search-impl"
,
action
=
"store_true"
)
parser
.
add_argument
(
"--num-prompts"
,
type
=
int
,
default
=
1000
,
...
...
benchmarks/kernels/benchmark_layernorm.py
View file @
539aa992
import
random
import
time
import
torch
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
seed_everything
)
@
torch
.
inference_mode
()
...
...
@@ -16,10 +16,7 @@ def main(num_tokens: int,
do_profile
:
bool
=
False
,
num_warmup_iters
:
int
=
5
,
num_iters
:
int
=
100
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
"cuda"
)
layer
=
RMSNorm
(
hidden_size
).
to
(
dtype
=
dtype
)
...
...
benchmarks/kernels/benchmark_machete.py
View file @
539aa992
...
...
@@ -4,8 +4,10 @@ import itertools
import
math
import
pickle
as
pkl
import
time
from
typing
import
Callable
,
Iterable
,
List
,
Tuple
from
itertools
import
product
from
typing
import
Callable
,
Iterable
,
List
,
Optional
,
Tuple
import
pandas
as
pd
import
torch
import
torch.utils.benchmark
as
TBenchmark
from
torch.utils.benchmark
import
Measurement
as
TMeasurement
...
...
@@ -84,6 +86,10 @@ def loop_over_weights(
fn
(
a
,
w_ref
,
w_q
,
w_s
)
_SWEEP_SCHEDULES_RESULTS
:
Optional
[
pd
.
DataFrame
]
=
None
_SWEEP_SCHEDULES_RESULTS_CSV
:
Optional
[
str
]
=
None
def
bench
(
atype
:
torch
.
dtype
,
wtype
:
ScalarType
,
group_size
:
int
,
...
...
@@ -94,6 +100,8 @@ def bench(atype: torch.dtype,
sub_label
:
str
,
benchmark_marlinv1
:
bool
=
True
,
sweep_schedules
:
bool
=
True
)
->
Iterable
[
TMeasurement
]:
global
_SWEEP_SCHEDULES_RESULTS
a
,
weights
=
make_bench_tensors
(
atype
,
wtype
,
group_size
,
m
,
n
,
k
)
sub_label
+=
f
", L=
{
len
(
weights
)
}
"
...
...
@@ -163,6 +171,11 @@ def bench(atype: torch.dtype,
best_schedule
=
None
schedules
=
ops
.
machete_supported_schedules
(
wtype
)
for
schedule
in
reversed
(
schedules
):
schedule_M
=
int
(
schedule
.
split
(
"_"
)[
0
].
split
(
"x"
)[
1
])
# Prune known bad schedules
if
schedule_M
>=
2
*
max
(
m
,
16
)
or
schedule_M
<
m
//
4
:
continue
def
run
(
a
,
_
,
w_q
,
w_s
,
schedule
=
schedule
):
ops
.
machete_gemm
(
a
,
...
...
@@ -175,6 +188,20 @@ def bench(atype: torch.dtype,
res
=
bench_fn
(
label
,
sub_label
,
"machete_best"
,
lambda
:
loop_over_weights
(
a
,
weights_machete
,
run
))
results_row
=
{
"M"
:
m
,
"K"
:
k
,
"N"
:
n
,
"group_size"
:
group_size
,
"schedule"
:
schedule
,
"median"
:
res
.
median
,
}
if
_SWEEP_SCHEDULES_RESULTS
is
None
:
_SWEEP_SCHEDULES_RESULTS
=
pd
.
DataFrame
(
columns
=
results_row
.
keys
())
_SWEEP_SCHEDULES_RESULTS
.
\
loc
[
len
(
_SWEEP_SCHEDULES_RESULTS
)]
=
results_row
print
(
f
"
{
res
.
median
:
5.5
}
"
,
schedule
)
if
not
best
or
res
.
median
<
best
.
median
:
best
=
res
...
...
@@ -235,18 +262,22 @@ def run_square_bench(args):
dim_sizes
=
list
(
range
(
args
.
dim_start
,
args
.
dim_end
+
1
,
args
.
dim_increment
))
MKNs
=
list
(
zip
(
dim_sizes
,
dim_sizes
,
dim_sizes
))
data
=
run
(
args
.
dtype
,
args
.
sweep_schedules
,
MKNs
)
make_output
(
data
,
MKNs
,
f
"square_bench-
{
args
.
dtype
}
"
)
def
run_range_bench
(
args
):
dim_sizes
=
list
(
range
(
args
.
dim_start
,
args
.
dim_end
,
args
.
dim_increment
))
n
=
len
(
dim_sizes
)
Ms
=
[
args
.
m_constant
]
*
n
if
args
.
m_constant
is
not
None
else
dim_sizes
Ks
=
[
args
.
k_constant
]
*
n
if
args
.
k_constant
is
not
None
else
dim_sizes
Ns
=
[
args
.
n_constant
]
*
n
if
args
.
n_constant
is
not
None
else
dim_sizes
MKNs
=
list
(
zip
(
Ms
,
Ks
,
Ns
))
m_start
,
k_start
,
n_start
=
[
int
(
x
)
for
x
in
args
.
dim_start
.
split
(
","
)]
m_end
,
k_end
,
n_end
=
[
int
(
x
)
for
x
in
args
.
dim_end
.
split
(
","
)]
m_increment
,
k_increment
,
n_increment
=
\
[
int
(
x
)
for
x
in
args
.
dim_increment
.
split
(
","
)]
Ms
=
list
(
range
(
m_start
,
m_end
+
1
,
m_increment
))
Ks
=
list
(
range
(
k_start
,
k_end
+
1
,
k_increment
))
Ns
=
list
(
range
(
n_start
,
n_end
+
1
,
n_increment
))
MKNs
=
list
(
product
(
Ms
,
Ks
,
Ns
))
data
=
run
(
args
.
dtype
,
args
.
sweep_schedules
,
MKNs
)
make_output
(
data
,
MKNs
,
f
"range_bench-
{
args
.
dtype
}
"
)
...
...
@@ -333,6 +364,9 @@ Benchmark Machete GEMM.
action
=
"store_true"
,
help
=
"Run a sweep over all supported schedules"
,
)
parser
.
add_argument
(
"--sweep-csv-out"
,
help
=
"CSV to store sweep results"
,
default
=
"sch_sweep_results.csv"
)
subparsers
=
parser
.
add_subparsers
(
dest
=
"cmd"
,
required
=
True
)
square_parser
=
subparsers
.
add_parser
(
"square_bench"
)
...
...
@@ -342,12 +376,21 @@ Benchmark Machete GEMM.
square_parser
.
set_defaults
(
func
=
run_square_bench
)
range_parser
=
subparsers
.
add_parser
(
"range_bench"
)
range_parser
.
add_argument
(
"--dim-start"
,
type
=
int
,
required
=
True
)
range_parser
.
add_argument
(
"--dim-end"
,
type
=
int
,
required
=
True
)
range_parser
.
add_argument
(
"--dim-increment"
,
type
=
int
,
required
=
True
)
range_parser
.
add_argument
(
"--m-constant"
,
type
=
int
,
default
=
None
)
range_parser
.
add_argument
(
"--n-constant"
,
type
=
int
,
default
=
None
)
range_parser
.
add_argument
(
"--k-constant"
,
type
=
int
,
default
=
None
)
range_parser
.
add_argument
(
"--dim-start"
,
type
=
str
,
required
=
True
,
help
=
"Start value for M,K,N as common separated list"
)
range_parser
.
add_argument
(
"--dim-end"
,
type
=
str
,
required
=
True
,
help
=
"End value (inclusive) for M,K,N as common separated list"
)
range_parser
.
add_argument
(
"--dim-increment"
,
type
=
str
,
required
=
True
,
help
=
"Increment value for M,K,N as common separated list"
)
range_parser
.
set_defaults
(
func
=
run_range_bench
)
model_parser
=
subparsers
.
add_parser
(
"model_bench"
)
...
...
@@ -369,4 +412,9 @@ Benchmark Machete GEMM.
model_parser
.
set_defaults
(
func
=
run_model_bench
)
args
=
parser
.
parse_args
()
_SWEEP_SCHEDULES_RESULTS_CSV
=
args
.
sweep_csv_out
args
.
func
(
args
)
if
_SWEEP_SCHEDULES_RESULTS
is
not
None
:
_SWEEP_SCHEDULES_RESULTS
.
to_csv
(
_SWEEP_SCHEDULES_RESULTS_CSV
)
benchmarks/kernels/benchmark_moe.py
View file @
539aa992
...
...
@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
from
transformers
import
AutoConfig
from
vllm.model_executor.layers.fused_moe.fused_moe
import
*
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
,
seed_everything
class
BenchmarkConfig
(
TypedDict
):
...
...
@@ -166,7 +166,7 @@ class BenchmarkWorker:
def
__init__
(
self
,
seed
:
int
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
seed
)
seed_everything
(
seed
)
self
.
seed
=
seed
def
benchmark
(
...
...
@@ -180,7 +180,7 @@ class BenchmarkWorker:
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
)
->
Tuple
[
Dict
[
str
,
int
],
float
]:
torch
.
cuda
.
manual_seed_all
(
self
.
seed
)
seed_everything
(
self
.
seed
)
dtype_str
=
get_config_dtype_str
(
dtype
,
use_int8_w8a16
=
use_int8_w8a16
,
use_fp8_w8a8
=
use_fp8_w8a8
)
...
...
benchmarks/kernels/benchmark_paged_attention.py
View file @
539aa992
...
...
@@ -6,8 +6,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
create_kv_caches_with_random
)
import
vllm.envs
as
envs
create_kv_caches_with_random
,
seed_everything
)
NUM_BLOCKS
=
1024
PARTITION_SIZE
=
512
...
...
@@ -29,10 +28,7 @@ def main(
device
:
str
=
"cuda"
,
kv_cache_dtype
:
Optional
[
str
]
=
None
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
query
=
torch
.
empty
(
num_seqs
,
...
...
benchmarks/kernels/benchmark_quant.py
View file @
539aa992
import
random
import
time
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
seed_everything
)
@
torch
.
inference_mode
()
...
...
@@ -17,10 +17,7 @@ def main(num_tokens: int,
do_profile
:
bool
=
False
,
num_warmup_iters
:
int
=
5
,
num_iters
:
int
=
100
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
"cuda"
)
x
=
torch
.
randn
(
num_tokens
,
hidden_size
,
dtype
=
dtype
)
...
...
benchmarks/kernels/benchmark_rope.py
View file @
539aa992
...
...
@@ -6,7 +6,7 @@ import torch
from
vllm.model_executor.layers.rotary_embedding
import
(
RotaryEmbedding
,
get_rope
)
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
,
seed_everything
def
benchmark_rope_kernels_multi_lora
(
...
...
@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
max_position
:
int
=
8192
,
base
:
int
=
10000
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
rotary_dim
=
head_size
...
...
benchmarks/kernels/graph_machete_bench.py
View file @
539aa992
...
...
@@ -45,8 +45,7 @@ if __name__ == "__main__":
rows
=
int
(
math
.
ceil
(
len
(
results
)
/
2
))
fig
,
axs
=
plt
.
subplots
(
rows
,
2
,
figsize
=
(
12
,
5
*
rows
))
axs
=
axs
.
flatten
()
axs_idx
=
0
for
shape
,
data
in
results
.
items
():
for
axs_idx
,
(
shape
,
data
)
in
enumerate
(
results
.
items
()):
plt
.
sca
(
axs
[
axs_idx
])
df
=
pd
.
DataFrame
(
data
)
sns
.
lineplot
(
data
=
df
,
...
...
@@ -59,6 +58,5 @@ if __name__ == "__main__":
palette
=
"Dark2"
)
plt
.
title
(
f
"Shape:
{
shape
}
"
)
plt
.
ylabel
(
"time (median, s)"
)
axs_idx
+=
1
plt
.
tight_layout
()
plt
.
savefig
(
"graph_machete_bench.pdf"
)
benchmarks/kernels/requirements.txt
0 → 100644
View file @
539aa992
pandas
\ No newline at end of file
cmake/cpu_extension.cmake
View file @
539aa992
...
...
@@ -120,4 +120,3 @@ define_gpu_extension_target(
)
message
(
STATUS
"Enabling C extension."
)
add_dependencies
(
default _C
)
cmake/utils.cmake
View file @
539aa992
...
...
@@ -355,18 +355,19 @@ function (define_gpu_extension_target GPU_MOD_NAME)
target_include_directories
(
${
GPU_MOD_NAME
}
PRIVATE csrc
${
GPU_INCLUDE_DIRECTORIES
}
)
# TODO: is torch_python_LIBRARY needed?
target_link_libraries
(
${
GPU_MOD_NAME
}
PRIVATE torch
${
torch_python_LIBRARY
}
${
GPU_LIBRARIES
}
)
target_link_libraries
(
${
GPU_MOD_NAME
}
PRIVATE torch
${
GPU_LIBRARIES
}
)
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
# dependencies that are not necessary and may not be installed.
if
(
GPU_LANGUAGE STREQUAL
"CUDA"
)
if
(
"
${
CUDA_CUDA_LIB
}
"
STREQUAL
""
)
set
(
CUDA_CUDA_LIB
"
${
CUDA_CUDA_LIBRARY
}
"
)
endif
()
target_link_libraries
(
${
GPU_MOD_NAME
}
PRIVATE
${
CUDA_CUDA_LIB
}
${
CUDA_LIBRARIES
}
)
else
()
target_link_libraries
(
${
GPU_MOD_NAME
}
PRIVATE
${
TORCH_LIBRARIES
}
)
endif
()
install
(
TARGETS
${
GPU_MOD_NAME
}
LIBRARY DESTINATION
${
GPU_DESTINATION
}
)
install
(
TARGETS
${
GPU_MOD_NAME
}
LIBRARY DESTINATION
${
GPU_DESTINATION
}
COMPONENT
${
GPU_MOD_NAME
}
)
endfunction
()
collect_env.py
View file @
539aa992
...
...
@@ -285,9 +285,14 @@ def summarize_vllm_build_flags():
def
get_gpu_topo
(
run_lambda
):
output
=
None
if
get_platform
()
==
'linux'
:
return
run_and_read_all
(
run_lambda
,
'nvidia-smi topo -m'
)
return
None
output
=
run_and_read_all
(
run_lambda
,
'nvidia-smi topo -m'
)
if
output
is
None
:
output
=
run_and_read_all
(
run_lambda
,
'rocm-smi --showtopo'
)
return
output
# example outputs of CPU infos
...
...
csrc/cpu/quant.cpp
View file @
539aa992
...
...
@@ -257,11 +257,13 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
// static-per-tensor quantization.
void
static_scaled_int8_quant
(
torch
::
Tensor
&
out
,
// [..., hidden_size]
const
torch
::
Tensor
&
input
,
// [..., hidden_size]
const
torch
::
Tensor
&
scale
)
{
const
torch
::
Tensor
&
scale
,
c10
::
optional
<
torch
::
Tensor
>
const
&
azp
)
{
CPU_KERNEL_GUARD_IN
(
static_scaled_int8_quant
)
TORCH_CHECK
(
input
.
is_contiguous
());
TORCH_CHECK
(
out
.
is_contiguous
());
TORCH_CHECK
(
scale
.
numel
()
==
1
);
TORCH_CHECK
(
!
azp
.
has_value
(),
"Zero point is not supported on CPU."
);
const
int
hidden_size
=
input
.
size
(
-
1
);
const
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
...
...
@@ -277,11 +279,12 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
void
dynamic_scaled_int8_quant
(
torch
::
Tensor
&
out
,
// [..., hidden_size]
const
torch
::
Tensor
&
input
,
// [..., hidden_size]
torch
::
Tensor
&
scale
// [..., 1]
)
{
torch
::
Tensor
&
scale
,
// [..., 1]
c10
::
optional
<
torch
::
Tensor
>
const
&
azp
)
{
CPU_KERNEL_GUARD_IN
(
dynamic_scaled_int8_quant
)
TORCH_CHECK
(
input
.
is_contiguous
());
TORCH_CHECK
(
out
.
is_contiguous
());
TORCH_CHECK
(
!
azp
.
has_value
(),
"Zero point is not supported on CPU."
);
int
const
hidden_size
=
input
.
size
(
-
1
);
int
const
num_tokens
=
input
.
numel
()
/
hidden_size
;
...
...
csrc/cpu/torch_bindings.cpp
View file @
539aa992
...
...
@@ -94,13 +94,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#ifdef __AVX512F__
// Compute int8 quantized tensor for given scaling factor.
ops
.
def
(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale
) ->
"
"()"
);
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale
,
"
"
Tensor? azp) ->
()"
);
ops
.
impl
(
"static_scaled_int8_quant"
,
torch
::
kCPU
,
&
static_scaled_int8_quant
);
// Compute int8 quantized tensor and scaling factor
ops
.
def
(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale
) ->
"
"()"
);
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale
,
"
"
Tensor!? azp) ->
()"
);
ops
.
impl
(
"dynamic_scaled_int8_quant"
,
torch
::
kCPU
,
&
dynamic_scaled_int8_quant
);
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column
...
...
csrc/custom_all_reduce.cu
View file @
539aa992
...
...
@@ -55,18 +55,6 @@ bool _is_weak_contiguous(torch::Tensor& t) {
t
.
numel
()
*
t
.
element_size
());
}
bool
should_custom_ar
(
torch
::
Tensor
&
inp
,
int64_t
max_size
,
int64_t
world_size
,
bool
full_nvlink
)
{
auto
inp_size
=
inp
.
numel
()
*
inp
.
element_size
();
// custom allreduce requires input byte size to be multiples of 16
if
(
inp_size
%
16
!=
0
)
return
false
;
if
(
!
_is_weak_contiguous
(
inp
))
return
false
;
if
(
world_size
==
2
||
full_nvlink
)
return
inp_size
<=
max_size
;
// for 4 or more non NVLink-capable GPUs, custom allreduce provides little
// performance improvement over NCCL.
return
false
;
}
void
_all_reduce
(
fptr_t
_fa
,
torch
::
Tensor
&
inp
,
torch
::
Tensor
&
out
,
cudaStream_t
stream
)
{
auto
fa
=
reinterpret_cast
<
vllm
::
CustomAllreduce
*>
(
_fa
);
...
...
csrc/custom_all_reduce.cuh
View file @
539aa992
...
...
@@ -6,6 +6,7 @@
#include <cuda_runtime.h>
#include <iostream>
#include <array>
#include <limits>
#include <map>
#include <unordered_map>
...
...
@@ -23,17 +24,23 @@
namespace
vllm
{
constexpr
int
kMaxBlocks
=
64
;
// note: we don't want to use atomics for signals because peer atomics are not
// supported on PCIe links
constexpr
int
kMaxBlocks
=
36
;
// Counter may overflow, but it's fine since unsigned int overflow is
// well-defined behavior.
using
FlagType
=
uint32_t
;
struct
Signal
{
alignas
(
128
)
uint32_t
start
[
kMaxBlocks
][
8
];
alignas
(
128
)
uint32_t
end
[
kMaxBlocks
][
8
];
alignas
(
128
)
FlagType
self_counter
[
kMaxBlocks
][
8
];
// Two sets of peer counters are needed for two syncs. The reason is that
// it's possible for peer GPU block to arrive at the second sync point while
// the current GPU block hasn't passed the first sync point. Thus, peer GPU
// may write counter+1 while current GPU is busy waiting for counter. We use
// alternating counter array to avoid this possibility.
alignas
(
128
)
FlagType
peer_counter
[
2
][
kMaxBlocks
][
8
];
};
struct
__align__
(
16
)
RankData
{
const
void
*
__restrict__
ptrs
[
8
];
};
struct
__align__
(
16
)
RankSignals
{
volatile
Signal
*
signals
[
8
];
};
struct
__align__
(
16
)
RankSignals
{
Signal
*
signals
[
8
];
};
// like std::array, but aligned
template
<
typename
T
,
int
sz
>
...
...
@@ -123,47 +130,71 @@ DINLINE O downcast(array_t<float, O::size> val) {
}
}
// This function is meant to be used as the first synchronization in the all
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
// prior memory accesses. Note: volatile writes will not be reordered against
// other volatile writes.
template
<
int
ngpus
>
DINLINE
void
start_sync
(
const
RankSignals
&
sg
,
volatile
Signal
*
self_sg
,
int
rank
)
{
if
(
threadIdx
.
x
<
ngpus
)
{
// reset flag for next time
self_sg
->
end
[
blockIdx
.
x
][
threadIdx
.
x
]
=
0
;
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
sg
.
signals
[
threadIdx
.
x
]
->
start
[
blockIdx
.
x
][
rank
]
=
1
;
// wait until we got true from all ranks
while
(
!
self_sg
->
start
[
blockIdx
.
x
][
threadIdx
.
x
]);
}
__syncthreads
();
static
DINLINE
void
st_flag_release
(
FlagType
*
flag_addr
,
FlagType
flag
)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm
volatile
(
"st.release.sys.global.u32 [%1], %0;"
::
"r"
(
flag
),
"l"
(
flag_addr
));
#else
asm
volatile
(
"membar.sys; st.volatile.global.u32 [%1], %0;"
::
"r"
(
flag
),
"l"
(
flag_addr
));
#endif
}
static
DINLINE
FlagType
ld_flag_acquire
(
FlagType
*
flag_addr
)
{
FlagType
flag
;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm
volatile
(
"ld.acquire.sys.global.u32 %0, [%1];"
:
"=r"
(
flag
)
:
"l"
(
flag_addr
));
#else
asm
volatile
(
"ld.volatile.global.u32 %0, [%1]; membar.gl;"
:
"=r"
(
flag
)
:
"l"
(
flag_addr
));
#endif
return
flag
;
}
// This function is meant to be used as the second or the final synchronization
// barrier in the all reduce kernel. If it's the final synchronization barrier,
// we don't need to make any visibility guarantees for prior memory accesses.
template
<
int
ngpus
,
bool
final_sync
=
false
>
DINLINE
void
end_sync
(
const
RankSignals
&
sg
,
volatile
Signal
*
self_sg
,
int
rank
)
{
__syncthreads
();
// eliminate the case that prior writes are not visible after signals become
// visible. Note that I did not manage to make this happen through a lot of
// testing. Might be the case that hardware provides stronger guarantees than
// the memory model.
if
constexpr
(
!
final_sync
)
__threadfence_system
();
static
DINLINE
void
st_flag_volatile
(
FlagType
*
flag_addr
,
FlagType
flag
)
{
asm
volatile
(
"st.volatile.global.u32 [%1], %0;"
::
"r"
(
flag
),
"l"
(
flag_addr
));
}
static
DINLINE
FlagType
ld_flag_volatile
(
FlagType
*
flag_addr
)
{
FlagType
flag
;
asm
volatile
(
"ld.volatile.global.u32 %0, [%1];"
:
"=r"
(
flag
)
:
"l"
(
flag_addr
));
return
flag
;
}
// is_start: whether this is the very first synchronization barrier.
// need_fence: whether a memory fence is needed. If true, a release-acquire
// semantic is used to enforce memory access order before and after this
// barrier.
template
<
int
ngpus
,
bool
is_start
,
bool
need_fence
=
false
>
DINLINE
void
multi_gpu_barrier
(
const
RankSignals
&
sg
,
Signal
*
self_sg
,
int
rank
)
{
if
constexpr
(
!
is_start
)
__syncthreads
();
static_assert
(
!
(
is_start
&&
need_fence
));
// Start barrier shouldn't need fence.
if
(
threadIdx
.
x
<
ngpus
)
{
// reset flag for next time
self_sg
->
start
[
blockIdx
.
x
][
threadIdx
.
x
]
=
0
;
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
sg
.
signals
[
threadIdx
.
x
]
->
end
[
blockIdx
.
x
][
rank
]
=
1
;
// wait until we got true from all ranks
while
(
!
self_sg
->
end
[
blockIdx
.
x
][
threadIdx
.
x
]);
// Increment the counter. Technically we only need one counter, but we use
// multiple per block to eliminate the need to share the counter via smem.
auto
val
=
self_sg
->
self_counter
[
blockIdx
.
x
][
threadIdx
.
x
]
+=
1
;
// Write the expected counter value to peer and wait for correct value from
// peer.
auto
peer_counter_ptr
=
&
sg
.
signals
[
threadIdx
.
x
]
->
peer_counter
[
val
%
2
][
blockIdx
.
x
][
rank
];
auto
self_counter_ptr
=
&
self_sg
->
peer_counter
[
val
%
2
][
blockIdx
.
x
][
threadIdx
.
x
];
if
constexpr
(
need_fence
)
{
st_flag_release
(
peer_counter_ptr
,
val
);
while
(
ld_flag_acquire
(
self_counter_ptr
)
!=
val
);
}
else
{
st_flag_volatile
(
peer_counter_ptr
,
val
);
while
(
ld_flag_volatile
(
self_counter_ptr
)
!=
val
);
}
}
if
constexpr
(
!
final_sy
nc
)
__syncthreads
();
if
constexpr
(
is_start
||
need_fe
nc
e
)
__syncthreads
();
}
template
<
typename
P
,
int
ngpus
,
typename
A
>
...
...
@@ -178,33 +209,31 @@ DINLINE P packed_reduce(const P* ptrs[], int idx) {
template
<
typename
T
,
int
ngpus
>
__global__
void
__launch_bounds__
(
512
,
1
)
cross_device_reduce_1stage
(
RankData
*
_dp
,
RankSignals
sg
,
volatile
Signal
*
self_sg
,
T
*
__restrict__
result
,
int
rank
,
int
size
)
{
cross_device_reduce_1stage
(
RankData
*
_dp
,
RankSignals
sg
,
Signal
*
self_sg
,
T
*
__restrict__
result
,
int
rank
,
int
size
)
{
using
P
=
typename
packed_t
<
T
>::
P
;
using
A
=
typename
packed_t
<
T
>::
A
;
// note: we don't reorder the address so the accumulation order is the same
// for all ranks, ensuring bitwise identical results
auto
dp
=
*
_dp
;
start_sync
<
ngpus
>
(
sg
,
self_sg
,
rank
);
multi_gpu_barrier
<
ngpus
,
true
>
(
sg
,
self_sg
,
rank
);
// do the actual reduction
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
idx
<
size
;
idx
+=
gridDim
.
x
*
blockDim
.
x
)
{
((
P
*
)
result
)[
idx
]
=
packed_reduce
<
P
,
ngpus
,
A
>
((
const
P
**
)
&
dp
.
ptrs
[
0
],
idx
);
}
end_sync
<
ngpus
,
tru
e
>
(
sg
,
self_sg
,
rank
);
multi_gpu_barrier
<
ngpus
,
fals
e
>
(
sg
,
self_sg
,
rank
);
}
template
<
typename
P
>
DINLINE
P
*
get_tmp_buf
(
volatile
Signal
*
sg
)
{
DINLINE
P
*
get_tmp_buf
(
Signal
*
sg
)
{
return
(
P
*
)(((
Signal
*
)
sg
)
+
1
);
}
template
<
typename
T
,
int
ngpus
>
__global__
void
__launch_bounds__
(
512
,
1
)
cross_device_reduce_2stage
(
RankData
*
_dp
,
RankSignals
sg
,
volatile
Signal
*
self_sg
,
T
*
__restrict__
result
,
int
rank
,
int
size
)
{
cross_device_reduce_2stage
(
RankData
*
_dp
,
RankSignals
sg
,
Signal
*
self_sg
,
T
*
__restrict__
result
,
int
rank
,
int
size
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
stride
=
gridDim
.
x
*
blockDim
.
x
;
using
P
=
typename
packed_t
<
T
>::
P
;
...
...
@@ -222,12 +251,12 @@ __global__ void __launch_bounds__(512, 1)
tmps
[
i
]
=
get_tmp_buf
<
P
>
(
sg
.
signals
[
target
]);
}
auto
tmp_out
=
tmps
[
0
];
start_sync
<
ngpus
>
(
sg
,
self_sg
,
rank
);
multi_gpu_barrier
<
ngpus
,
true
>
(
sg
,
self_sg
,
rank
);
// stage 1: reduce scatter
for
(
int
idx
=
start
+
tid
;
idx
<
end
;
idx
+=
stride
)
{
tmp_out
[
idx
-
start
]
=
packed_reduce
<
P
,
ngpus
,
A
>
(
ptrs
,
idx
);
}
end_sync
<
ngpus
>
(
sg
,
self_sg
,
rank
);
multi_gpu_barrier
<
ngpus
,
false
,
true
>
(
sg
,
self_sg
,
rank
);
// stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed
...
...
@@ -437,6 +466,8 @@ class CustomAllreduce {
#define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
rank_, size);
// TODO(hanzhi713): Threshold is different for A100 and H100.
// Add per device threshold.
#define REDUCE_CASE(ngpus) \
case ngpus: { \
if (world_size_ == 2) { \
...
...
csrc/custom_all_reduce_test.cu
View file @
539aa992
/**
* This is a standalone test for custom allreduce.
* To compile, make sure you have MPI and NCCL installed in your system.
* export MPI_HOME=
XXX
* export MPI_HOME=
xxx
* nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
* custom_all_reduce_test -lnccl -I${MPI_HOME}
/include
-lmpi
* custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
*
* Warning: this C++ test is not designed to be very readable and was used
* during the rapid prototyping process.
*
* To run:
* mpirun -np 8 ./custom_all_reduce_test
* mpirun
--allow-run-as-root
-np 8 ./custom_all_reduce_test
*/
#include <cuda.h>
#include <curand_kernel.h>
...
...
@@ -44,7 +44,14 @@
} while (0)
__global__
void
dummy_kernel
()
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for
(
int
i
=
0
;
i
<
100
;
i
++
)
__nanosleep
(
1000000
);
// 100ms
#else
for
(
int
i
=
0
;
i
<
100
;
i
++
)
{
long
long
int
start
=
clock64
();
while
(
clock64
()
-
start
<
150000000
);
// approximately 98.4ms on P40
}
#endif
}
template
<
typename
T
>
...
...
@@ -302,15 +309,19 @@ int main(int argc, char** argv) {
bool
performance_test
=
true
;
cudaProfilerStart
();
// for (int threads : {256, 512}) {
// Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024);
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test);
// }
// }
// Scan through different sizes to test performance.
for
(
int
sz
=
512
;
sz
<=
(
8
<<
20
);
sz
*=
2
)
{
run
<
half
>
(
myRank
,
nRanks
,
comm
,
512
,
36
,
sz
+
8
*
47
,
performance_test
);
}
cudaProfilerStop
();
MPICHECK
(
MPI_Finalize
());
return
EXIT_SUCCESS
;
}
csrc/cutlass_extensions/torch_utils.hpp
View file @
539aa992
...
...
@@ -68,7 +68,13 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
name
,
".stride("
,
idx
,
") to be "
,
StrideEle
::
value
);
return
StrideEle
{};
}
else
{
return
tensor
.
stride
(
idx
);
if
(
tensor
.
size
(
idx
)
==
1
)
{
// use 0 stride for dim with size 1, this is easier for
// cute/cutlass to optimize (helps the TMA code flatten dims)
return
StrideEle
{
0
};
}
else
{
return
tensor
.
stride
(
idx
);
}
}
}
else
{
// Extra strides are assumed to be 0 or 1
...
...
csrc/mamba/causal_conv1d/causal_conv1d.cu
View file @
539aa992
...
...
@@ -198,7 +198,8 @@ causal_conv1d_update(const at::Tensor &x,
const
at
::
Tensor
&
conv_state
,
const
at
::
Tensor
&
weight
,
const
c10
::
optional
<
at
::
Tensor
>
&
bias_
,
bool
silu_activation
)
{
bool
silu_activation
,
const
c10
::
optional
<
at
::
Tensor
>
&
conv_state_indices_
)
{
auto
input_type
=
x
.
scalar_type
();
auto
weight_type
=
weight
.
scalar_type
();
TORCH_CHECK
(
input_type
==
at
::
ScalarType
::
Float
||
input_type
==
at
::
ScalarType
::
Half
||
input_type
==
at
::
ScalarType
::
BFloat16
);
...
...
@@ -216,7 +217,6 @@ causal_conv1d_update(const at::Tensor &x,
const
int
width
=
weight
.
size
(
-
1
);
CHECK_SHAPE
(
x
,
batch_size
,
dim
);
CHECK_SHAPE
(
conv_state
,
batch_size
,
dim
,
width
);
CHECK_SHAPE
(
weight
,
dim
,
width
);
TORCH_CHECK
(
width
>=
2
&&
width
<=
4
,
"causal_conv1d only supports width between 2 and 4"
);
...
...
@@ -241,6 +241,22 @@ causal_conv1d_update(const at::Tensor &x,
params
.
conv_state_c_stride
=
conv_state
.
stride
(
1
);
params
.
conv_state_l_stride
=
conv_state
.
stride
(
2
);
if
(
conv_state_indices_
.
has_value
())
{
auto
conv_state_indices
=
conv_state_indices_
.
value
();
TORCH_CHECK
(
conv_state_indices
.
scalar_type
()
==
torch
::
kInt32
)
TORCH_CHECK
(
conv_state_indices
.
is_cuda
());
TORCH_CHECK
(
conv_state_indices
.
stride
(
0
)
==
1
)
CHECK_SHAPE
(
conv_state_indices
,
batch_size
);
int
conv_state_entries
=
conv_state
.
size
(
0
);
CHECK_SHAPE
(
conv_state
,
conv_state_entries
,
dim
,
width
);
params
.
conv_state_indices_ptr
=
conv_state_indices
.
data_ptr
<
int32_t
>
();
}
else
{
CHECK_SHAPE
(
conv_state
,
batch_size
,
dim
,
width
);
params
.
conv_state_indices_ptr
=
nullptr
;
}
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at
::
cuda
::
CUDAGuard
device_guard
{(
char
)
x
.
get_device
()};
...
...
@@ -646,8 +662,16 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
const
int
channel_id
=
blockIdx
.
y
*
kNThreads
+
tidx
;
input_t
*
x
=
reinterpret_cast
<
input_t
*>
(
params
.
x_ptr
)
+
batch_id
*
params
.
x_batch_stride
+
channel_id
*
params
.
x_c_stride
;
input_t
*
conv_state
=
reinterpret_cast
<
input_t
*>
(
params
.
conv_state_ptr
)
+
batch_id
*
params
.
conv_state_batch_stride
// If params.conv_state_indices_ptr is set, then the conv state is gathered from the conv state tensor
// along the batch axis. Otherwise, the conv state coordinate is the same as the batch id.
const
int
conv_state_batch_coord
=
params
.
conv_state_indices_ptr
==
nullptr
?
batch_id
:
params
.
conv_state_indices_ptr
[
batch_id
];
input_t
*
conv_state
=
reinterpret_cast
<
input_t
*>
(
params
.
conv_state_ptr
)
+
conv_state_batch_coord
*
params
.
conv_state_batch_stride
+
channel_id
*
params
.
conv_state_c_stride
;
weight_t
*
weight
=
reinterpret_cast
<
weight_t
*>
(
params
.
weight_ptr
)
+
channel_id
*
params
.
weight_c_stride
;
input_t
*
out
=
reinterpret_cast
<
input_t
*>
(
params
.
out_ptr
)
+
batch_id
*
params
.
out_batch_stride
+
channel_id
*
params
.
out_c_stride
;
...
...
Prev
1
2
3
4
5
6
…
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment