sglang · Commits · 81262c7b

clean up useless file (#3192)

Unverified commit 81262c7b, authored Jan 28, 2025 by Xiaoyu Zhang; committed by GitHub on Jan 28, 2025. Parent: 27aeb4b7.

1 changed file with 0 additions and 159 deletions:
sgl-kernel/benchmark/bench_sampling_scaling_penalties.py (deleted, mode 100644 → 0)

The removed script benchmarked the fused `sampling_scaling_penalties` op from `sgl_kernel` against a naive PyTorch implementation, comparing both latency and peak GPU memory across a sweep of batch and vocabulary sizes.
```python
import itertools

import torch
import triton
from sgl_kernel import sampling_scaling_penalties


def sampling_scaling_penalties_naive(logits, scaling_penalties):
    # Reference implementation: divide positive logits by the penalty,
    # multiply negative logits by it.
    return torch.where(
        logits > 0,
        logits / scaling_penalties,
        logits * scaling_penalties,
    )


def sampling_scaling_penalties_kernel(logits, scaling_penalties):
    # Fused CUDA kernel from sgl_kernel.
    return sampling_scaling_penalties(logits, scaling_penalties)


def test_memory(func, _iter):
    # Average peak CUDA memory (MiB) over _iter invocations of func.
    total_mem = []

    for _ in range(_iter):
        torch.cuda.memory.reset_peak_memory_stats()
        func()
        mem = torch.cuda.max_memory_allocated() / (2**20)
        total_mem.append(mem)

    return sum(total_mem) / len(total_mem)


def calculate_diff(batch_size, vocab_size):
    # Correctness check: the kernel output should match the naive reference.
    dtype = torch.bfloat16
    device = torch.device("cuda")

    logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype)
    scaling_penalties = (
        torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5
    )  # penalties drawn from [0.5, 1.5)

    output_naive = sampling_scaling_penalties_naive(
        logits.clone(), scaling_penalties.clone()
    )
    output_kernel = sampling_scaling_penalties_kernel(
        logits.clone(), scaling_penalties.clone()
    )

    print(f"Naive output={output_naive}")
    print(f"Kernel output={output_kernel}")

    if torch.allclose(output_naive, output_kernel, atol=1e-2, rtol=1e-2):
        print("✅ Both implementations match")
    else:
        print("❌ Implementations differ")


batch_size_range = [2**i for i in range(0, 12)]  # 1 .. 2048
vocab_size_range = [2**i for i in range(10, 17)]  # 1024 .. 65536
configs = list(itertools.product(batch_size_range, vocab_size_range))


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size", "vocab_size"],
        x_vals=[list(_) for _ in configs],
        line_arg="provider",
        line_vals=["naive", "kernel"],
        line_names=["PyTorch Naive", "SGL Kernel"],
        styles=[("blue", "-"), ("red", "-")],
        ylabel="us",
        plot_name="sampling-scaling-penalties-performance",
        args={},
    )
)
def benchmark(batch_size, vocab_size, provider):
    dtype = torch.bfloat16
    device = torch.device("cuda")

    logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype)
    scaling_penalties = (
        torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5
    )

    quantiles = [0.5, 0.2, 0.8]  # median, 20th and 80th percentile latencies

    if provider == "naive":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: sampling_scaling_penalties_naive(
                logits.clone(),
                scaling_penalties.clone(),
            ),
            quantiles=quantiles,
        )
    else:
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: sampling_scaling_penalties_kernel(
                logits.clone(),
                scaling_penalties.clone(),
            ),
            quantiles=quantiles,
        )

    # do_bench reports milliseconds; convert to microseconds to match ylabel.
    return 1000 * ms, 1000 * max_ms, 1000 * min_ms


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size", "vocab_size"],
        x_vals=[list(_) for _ in configs],
        line_arg="provider",
        line_vals=["naive", "kernel"],
        line_names=["PyTorch Naive", "SGL Kernel"],
        styles=[("blue", "-"), ("red", "-")],
        ylabel="GPU memory usage (MB)",
        plot_name="sampling-scaling-penalties-memory",
        args={},
    )
)
def benchmark_memory(batch_size, vocab_size, provider):
    dtype = torch.bfloat16
    device = torch.device("cuda")

    print(
        f"Running memory benchmark with batch_size={batch_size}, "
        f"vocab_size={vocab_size}, provider={provider}"
    )

    def run_kernel():
        logits = torch.randn(batch_size, vocab_size, device=device, dtype=dtype)
        scaling_penalties = (
            torch.rand(batch_size, vocab_size, device=device, dtype=dtype) + 0.5
        )

        if provider == "naive":
            return sampling_scaling_penalties_naive(logits, scaling_penalties)
        else:
            return sampling_scaling_penalties_kernel(logits, scaling_penalties)

    mem = test_memory(run_kernel, _iter=10)
    return mem


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--save_path",
        type=str,
        default="./configs/benchmark_ops/sampling_scaling_penalties/",
        help="Path to save sampling_scaling_penalties benchmark results",
    )
    args = parser.parse_args()

    # Run correctness test
    calculate_diff(batch_size=4, vocab_size=4096)

    # Run performance benchmark
    benchmark.run(print_data=True, save_path=args.save_path)

    # Run memory benchmark
    benchmark_memory.run(print_data=True, save_path=args.save_path)
```
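For context on what the deleted script measured: the element-wise rule both implementations compute is the standard repetition-penalty update used in LLM sampling, where positive logits are divided by the penalty and negative logits are multiplied by it, so a penalty above 1 lowers a token's logit regardless of its sign. A minimal CPU-only sketch (not part of the deleted file; the toy values below are illustrative only):

```python
import torch

# Toy inputs; the deleted benchmark used bfloat16 CUDA tensors instead.
logits = torch.tensor([2.0, -2.0, 0.5, -0.5])
penalties = torch.full_like(logits, 2.0)

# Positive logits are divided by the penalty, negative ones multiplied:
#   2.0 -> 1.0, -2.0 -> -4.0, 0.5 -> 0.25, -0.5 -> -1.0
penalized = torch.where(logits > 0, logits / penalties, logits * penalties)
print(penalized)  # tensor([ 1.0000, -4.0000,  0.2500, -1.0000])
```

Note that the benchmark draws penalties from [0.5, 1.5), so values below 1 have the opposite effect and boost a token rather than suppress it.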