Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b169d5f7
Unverified
Commit
b169d5f7
authored
May 29, 2025
by
Duyi-Wang
Committed by
GitHub
May 29, 2025
Browse files
[Misc][Tools][Benchmark] Add benchmark_serving supports for llama.cpp. (#18692)
Signed-off-by:
Duyi-Wang
<
duyi.wang@intel.com
>
parent
f8977c23
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
1 deletion
+6
-1
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+2
-1
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+4
-0
No files found.
benchmarks/backend_request_func.py
View file @
b169d5f7
...
@@ -324,7 +324,7 @@ async def async_request_openai_completions(
...
@@ -324,7 +324,7 @@ async def async_request_openai_completions(
most_recent_timestamp
=
timestamp
most_recent_timestamp
=
timestamp
generated_text
+=
text
or
""
generated_text
+=
text
or
""
el
if
usage
:
=
data
.
get
(
"usage"
):
if
usage
:
=
data
.
get
(
"usage"
):
output
.
output_tokens
=
usage
.
get
(
"completion_tokens"
)
output
.
output_tokens
=
usage
.
get
(
"completion_tokens"
)
if
first_chunk_received
:
if
first_chunk_received
:
output
.
success
=
True
output
.
success
=
True
...
@@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = {
...
@@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = {
"tensorrt-llm"
:
async_request_trt_llm
,
"tensorrt-llm"
:
async_request_trt_llm
,
"scalellm"
:
async_request_openai_completions
,
"scalellm"
:
async_request_openai_completions
,
"sglang"
:
async_request_openai_completions
,
"sglang"
:
async_request_openai_completions
,
"llama.cpp"
:
async_request_openai_completions
,
}
}
OPENAI_COMPATIBLE_BACKENDS
=
[
OPENAI_COMPATIBLE_BACKENDS
=
[
...
...
benchmarks/benchmark_serving.py
View file @
b169d5f7
...
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
...
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
if
"temperature"
not
in
sampling_params
:
if
"temperature"
not
in
sampling_params
:
sampling_params
[
"temperature"
]
=
0.0
# Default to greedy decoding.
sampling_params
[
"temperature"
]
=
0.0
# Default to greedy decoding.
if
args
.
backend
==
"llama.cpp"
:
# Disable prompt caching in llama.cpp backend
sampling_params
[
"cache_prompt"
]
=
False
# Avoid GC processing "static" data - reduce pause times.
# Avoid GC processing "static" data - reduce pause times.
gc
.
collect
()
gc
.
collect
()
gc
.
freeze
()
gc
.
freeze
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment