[Misc][Tools][Benchmark] Add benchmark_serving support for llama.cpp. (#18692)

Signed-off-by: Duyi-Wang <duyi.wang@intel.com>

[Misc][Tools][Benchmark] Add benchmark_serving support for llama.cpp. (#18692)
Signed-off-by: Duyi-Wang <duyi.wang@intel.com>
b169d5f7 · Duyi-Wang · GitHub · f8977c23 · b169d5f7 · b169d5f7
Unverified Commit b169d5f7 authored May 29, 2025 by Duyi-Wang Committed by GitHub May 29, 2025
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 1 deletion

benchmarks/backend_request_func.py benchmarks/backend_request_func.py +2 -1

benchmarks/benchmark_serving.py benchmarks/benchmark_serving.py +4 -0

No files found.
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_completions(

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
-                            elif usage := data.get("usage"):
+                            if usage := data.get("usage"):
                                output.output_tokens = usage.get("completion_tokens")
                    if first_chunk_received:
                        output.success = True
@@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = {
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
 }

 OPENAI_COMPATIBLE_BACKENDS = [

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
    if "temperature" not in sampling_params:
        sampling_params["temperature"] = 0.0  # Default to greedy decoding.

+    if args.backend == "llama.cpp":
+        # Disable prompt caching in llama.cpp backend
+        sampling_params["cache_prompt"] = False
+
    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()