sglang · Commit 81262c7b (unverified)
Authored Jan 28, 2025 by Xiaoyu Zhang; committed by GitHub, Jan 28, 2025

    clean up useless file (#3192)

Parent: 27aeb4b7

1 changed file, 0 additions, 159 deletions:
deleted: sgl-kernel/benchmark/bench_sampling_scaling_penalties.py (file mode 100644 → 0, −159 lines)
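The single deleted file, reconstructed below, was a standalone benchmark for the fused sampling_scaling_penalties op from sgl_kernel. It compares the op against a naive PyTorch baseline (a torch.where that divides positive logits by the penalty and multiplies negative ones by it), verifies the two agree within atol=rtol=1e-2, and then uses triton.testing to sweep latency (reported in microseconds) and peak GPU memory over batch sizes 1–2048 and vocabulary sizes 1024–65536.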
```python
import itertools

import torch
import triton
from sgl_kernel import sampling_scaling_penalties


def sampling_scaling_penalties_naive(logits, scaling_penalties):
    return torch.where(
        logits > 0,
        logits / scaling_penalties,
        logits * scaling_penalties,
    )


def sampling_scaling_penalties_kernel(logits, scaling_penalties):
    return sampling_scaling_penalties(logits, scaling_penalties)


def test_memory(func, _iter):
    total_mem = []

    for _ in range(_iter):
        torch.cuda.memory.reset_peak_memory_stats()
        func()
        mem = torch.cuda.max_memory_allocated() / (2**20)
        total_mem.append(mem)

    return sum(total_mem) / len(total_mem)


def calculate_diff(batch_size, vocab_size):
    dtype = torch.bfloat16
    device = torch.device("cuda")

    logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype)
    scaling_penalties = (
        torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5
    )

    output_naive = sampling_scaling_penalties_naive(
        logits.clone(), scaling_penalties.clone()
    )
    output_kernel = sampling_scaling_penalties_kernel(
        logits.clone(), scaling_penalties.clone()
    )

    print(f"Naive output={output_naive}")
    print(f"Kernel output={output_kernel}")

    if torch.allclose(output_naive, output_kernel, atol=1e-2, rtol=1e-2):
        print("✅ Both implementations match")
    else:
        print("❌ Implementations differ")


batch_size_range = [2**i for i in range(0, 12)]
vocab_size_range = [2**i for i in range(10, 17)]
configs = list(itertools.product(batch_size_range, vocab_size_range))


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size", "vocab_size"],
        x_vals=[list(_) for _ in configs],
        line_arg="provider",
        line_vals=["naive", "kernel"],
        line_names=["PyTorch Naive", "SGL Kernel"],
        styles=[("blue", "-"), ("red", "-")],
        ylabel="us",
        plot_name="sampling-scaling-penalties-performance",
        args={},
    )
)
def benchmark(batch_size, vocab_size, provider):
    dtype = torch.bfloat16
    device = torch.device("cuda")

    logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype)
    scaling_penalties = (
        torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5
    )

    quantiles = [0.5, 0.2, 0.8]

    if provider == "naive":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: sampling_scaling_penalties_naive(
                logits.clone(),
                scaling_penalties.clone(),
            ),
            quantiles=quantiles,
        )
    else:
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: sampling_scaling_penalties_kernel(
                logits.clone(),
                scaling_penalties.clone(),
            ),
            quantiles=quantiles,
        )

    return 1000 * ms, 1000 * max_ms, 1000 * min_ms


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size", "vocab_size"],
        x_vals=[list(_) for _ in configs],
        line_arg="provider",
        line_vals=["naive", "kernel"],
        line_names=["PyTorch Naive", "SGL Kernel"],
        styles=[("blue", "-"), ("red", "-")],
        ylabel="GPU memory usage (MB)",
        plot_name="sampling-scaling-penalties-memory",
        args={},
    )
)
def benchmark_memory(batch_size, vocab_size, provider):
    dtype = torch.bfloat16
    device = torch.device("cuda")

    print(
        f"Running memory benchmark with batch_size={batch_size}, "
        f"vocab_size={vocab_size}, provider={provider}"
    )

    def run_kernel():
        logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype)
        scaling_penalties = (
            torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5
        )

        if provider == "naive":
            return sampling_scaling_penalties_naive(logits, scaling_penalties)
        else:
            return sampling_scaling_penalties_kernel(logits, scaling_penalties)

    mem = test_memory(run_kernel, _iter=10)
    return mem


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--save_path",
        type=str,
        default="./configs/benchmark_ops/sampling_scaling_penalties/",
        help="Path to save sampling_scaling_penalties benchmark results",
    )
    args = parser.parse_args()

    # Run correctness test
    calculate_diff(batch_size=4, vocab_size=4096)

    # Run performance benchmark
    benchmark.run(print_data=True, save_path=args.save_path)

    # Run memory benchmark
    benchmark_memory.run(print_data=True, save_path=args.save_path)
```
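For reference, the script was self-contained: running `python bench_sampling_scaling_penalties.py` on a CUDA machine with sgl_kernel installed executes the correctness check (batch_size=4, vocab_size=4096), then the latency and memory sweeps over all 12 × 7 = 84 (batch_size, vocab_size) configurations, saving results under --save_path.

To make the penalty rule concrete without a GPU or sgl_kernel, here is a minimal CPU sketch of the same torch.where rule; the tensor values are illustrative, not taken from the benchmark:

```python
import torch

# Same rule as sampling_scaling_penalties_naive: divide positive logits by
# the penalty, multiply negative ones by it, so a penalty > 1 always lowers
# the penalized token's probability.
logits = torch.tensor([4.0, -4.0, 0.5])
penalties = torch.tensor([2.0, 2.0, 2.0])

penalized = torch.where(logits > 0, logits / penalties, logits * penalties)
print(penalized)  # tensor([ 2.0000, -8.0000,  0.2500])
```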