Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhaoyu6
sglang
Commits
0feca02d
"ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp" did not exist on "4cc1a6143387f41e2466536abcd6a2620b63a35b"
Unverified
Commit
0feca02d
authored
Jul 13, 2024
by
Lianmin Zheng
Committed by
GitHub
Jul 13, 2024
Browse files
Improve benchmark scripts (#615)
parent
10143e1a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
5 deletions
+6
-5
benchmark/latency_throughput/bench_one.py
benchmark/latency_throughput/bench_one.py
+2
-1
python/sglang/srt/memory_pool.py
python/sglang/srt/memory_pool.py
+4
-4
No files found.
benchmark/latency_throughput/bench_one.py
View file @
0feca02d
...
@@ -100,11 +100,12 @@ def run_one_batch_size(bs):
...
@@ -100,11 +100,12 @@ def run_one_batch_size(bs):
with
open
(
"results.jsonl"
,
"a"
)
as
fout
:
with
open
(
"results.jsonl"
,
"a"
)
as
fout
:
res
=
{
res
=
{
"backend"
:
args
.
backend
,
"input_len"
:
args
.
input_len
,
"input_len"
:
args
.
input_len
,
"output_len"
:
args
.
max_tokens
,
"output_len"
:
args
.
max_tokens
,
"batch_size"
:
bs
,
"batch_size"
:
bs
,
"latency"
:
latency
,
"latency"
:
latency
,
"output_throughput"
:
output_throughput
"output_throughput"
:
output_throughput
,
}
}
fout
.
write
(
json
.
dumps
(
res
)
+
"
\n
"
)
fout
.
write
(
json
.
dumps
(
res
)
+
"
\n
"
)
...
...
python/sglang/srt/memory_pool.py
View file @
0feca02d
...
@@ -52,7 +52,7 @@ class TokenToKVPool:
...
@@ -52,7 +52,7 @@ class TokenToKVPool:
# Prefetch buffer
# Prefetch buffer
self
.
prefetch_buffer
=
torch
.
empty
(
0
,
device
=
"cuda"
,
dtype
=
torch
.
int32
)
self
.
prefetch_buffer
=
torch
.
empty
(
0
,
device
=
"cuda"
,
dtype
=
torch
.
int32
)
self
.
prefetch_chunk_size
=
256
self
.
prefetch_chunk_size
=
512
self
.
clear
()
self
.
clear
()
...
@@ -67,11 +67,11 @@ class TokenToKVPool:
...
@@ -67,11 +67,11 @@ class TokenToKVPool:
if
need_size
<=
buffer_len
:
if
need_size
<=
buffer_len
:
select_index
=
self
.
prefetch_buffer
[:
need_size
]
select_index
=
self
.
prefetch_buffer
[:
need_size
]
self
.
prefetch_buffer
=
self
.
prefetch_buffer
[
need_size
:]
self
.
prefetch_buffer
=
self
.
prefetch_buffer
[
need_size
:]
return
select_index
.
to
(
torch
.
int32
)
return
select_index
addition_size
=
need_size
-
buffer_len
addition_size
=
need_size
-
buffer_len
alloc_size
=
max
(
addition_size
,
self
.
prefetch_chunk_size
)
alloc_size
=
max
(
addition_size
,
self
.
prefetch_chunk_size
)
select_index
=
torch
.
nonzero
(
self
.
mem_state
==
0
).
squeeze
(
1
)[:
alloc_size
]
select_index
=
torch
.
nonzero
(
self
.
mem_state
==
0
).
squeeze
(
1
)[:
alloc_size
]
.
to
(
torch
.
int32
)
if
select_index
.
shape
[
0
]
<
addition_size
:
if
select_index
.
shape
[
0
]
<
addition_size
:
return
None
return
None
...
@@ -82,7 +82,7 @@ class TokenToKVPool:
...
@@ -82,7 +82,7 @@ class TokenToKVPool:
ret_index
=
self
.
prefetch_buffer
[:
need_size
]
ret_index
=
self
.
prefetch_buffer
[:
need_size
]
self
.
prefetch_buffer
=
self
.
prefetch_buffer
[
need_size
:]
self
.
prefetch_buffer
=
self
.
prefetch_buffer
[
need_size
:]
return
ret_index
.
to
(
torch
.
int32
)
return
ret_index
def
alloc_contiguous
(
self
,
need_size
):
def
alloc_contiguous
(
self
,
need_size
):
# NOTE: This function is deprecated.
# NOTE: This function is deprecated.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment