Unverified Commit 4bb08f6e authored by ykwd, committed by GitHub

[Hicache] Evaluate Per-Round Metrics in Multiturn Bench (#10203)


Co-authored-by: Teng Ma <sima.mt@alibaba-inc.com>
parent ec272dda
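As a rough usage sketch (the script path and the other flags are assumptions for illustration, not taken from this commit), the new option would be passed alongside the existing multiturn settings:

```bash
# Hypothetical invocation of the multiturn HiCache benchmark; only
# --enable-round-barrier is introduced by this commit, the rest is assumed.
python benchmark/hicache/bench_multiturn.py \
    --num-clients 64 \
    --num-rounds 8 \
    --enable-round-barrier
```

With the barrier enabled, round i requests are only sent once every client has finished round i-1, so the per-round TTFT and cache-hit-rate figures reported at the end are not skewed by clients racing ahead of each other.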
@@ -105,12 +105,16 @@ def parse_args():
action="store_true",
help="If set, disable automatically testing with a range of request rates.",
)
parser.add_argument(
"--disable-random-sample",
action="store_true",
help="If set, disable random sampling of requests from the ShareGPT dataset.",
)
parser.add_argument(
"--enable-round-barrier",
action="store_true",
help="If set, only send i-th turn requests after all (i-1)-th turn requests finished.",
)
parser.add_argument(
"--sub-question-input-length",
type=int,
@@ -335,6 +339,19 @@ class WorkloadGenerator:
"cached_tokens": [],
"generated_len": [],
}
self.enable_round_barrier = args.enable_round_barrier
if self.enable_round_barrier:
# Add round-specific metrics while preserving the original structure
for i in range(args.num_rounds):
self.performance_metrics[f"round_{i}"] = {
"ttft": [],
"latency": [],
"prompt_len": [],
"cached_tokens": [],
"generated_len": [],
}
self.num_clients = args.num_clients
self.num_rounds = args.num_rounds
self.max_parallel = args.max_parallel
self.output_length = args.output_length
@@ -383,6 +400,7 @@ class WorkloadGenerator:
loop.close()
def response_handler(self):
next_round_reqs = []
while True:
try:
client_id, response = self.response_queue.get(
@@ -391,12 +409,29 @@ class WorkloadGenerator:
if not response.success:
raise ValueError(f"Request failed with error: {response.error}")
self.client_records[client_id]["history"] += response.generated_text
current_round = self.client_records[client_id]["round"]
self.client_records[client_id]["round"] += 1
self.performance_metrics["ttft"].append(response.ttft)
self.performance_metrics["latency"].append(response.latency)
self.performance_metrics["prompt_len"].append(response.prompt_len)
self.performance_metrics["cached_tokens"].append(response.cached_tokens)
self.performance_metrics["generated_len"].append(response.generated_len)
if self.enable_round_barrier:
self.performance_metrics[f"round_{current_round}"]["ttft"].append(
response.ttft
)
self.performance_metrics[f"round_{current_round}"][
"latency"
].append(response.latency)
self.performance_metrics[f"round_{current_round}"][
"prompt_len"
].append(response.prompt_len)
self.performance_metrics[f"round_{current_round}"][
"cached_tokens"
].append(response.cached_tokens)
self.performance_metrics[f"round_{current_round}"][
"generated_len"
].append(response.generated_len)
self.completed_requests += 1
if self.client_records[client_id]["round"] < self.num_rounds:
@@ -404,16 +439,22 @@ class WorkloadGenerator:
self.client_records[client_id][
"history"
] += self.sub_question_inputs.pop().prompt
new_req = (
client_id,
gen_payload(
self.client_records[client_id]["history"],
self.output_length,
args.lora_path,
),
)
if self.enable_round_barrier:
next_round_reqs.append(new_req)
if len(next_round_reqs) == self.num_clients:
for req in next_round_reqs:
self.ready_queue.append(req)
next_round_reqs = []
else:
self.ready_queue.append(new_req)
except queue.Empty:
if self.pbar.n == self.pbar.total:
break
@@ -469,6 +510,25 @@ class WorkloadGenerator:
),
},
}
if self.enable_round_barrier:
performance_data["round"] = {}
for round_num in range(args.num_rounds):
round_key = f"round_{round_num}"
round_metrics = self.performance_metrics[round_key]
performance_data["round"][round_key] = {
"average_ttft": (
sum(round_metrics["ttft"]) / len(round_metrics["ttft"])
if round_metrics["ttft"]
else 0
),
"cache_hit_rate": (
0
if sum(round_metrics["prompt_len"]) == 0
else sum(round_metrics["cached_tokens"])
/ sum(round_metrics["prompt_len"])
),
"request_count": len(round_metrics["ttft"]),
}
print("All requests completed")
print("Performance metrics summary:")
print(
@@ -492,6 +552,26 @@ class WorkloadGenerator:
f" Request Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
)
print(f" Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")
if self.enable_round_barrier:
# Print round-based summary
print("Per-round metrics:")
if "round" in performance_data:
for round_num in range(self.num_rounds):
round_key = f"round_{round_num}"
if round_key in performance_data["round"]:
round_data = performance_data["round"][round_key]
avg_ttft = round_data["average_ttft"]
cache_hit_rate = round_data["cache_hit_rate"]
request_count = round_data["request_count"]
print(
f" Round {round_num}: Average TTFT = {avg_ttft:.2f}s, "
f"Cache Hit Rate = {cache_hit_rate:.6f} "
f"({request_count} requests)"
)
else:
print(f" Round {round_num}: No requests completed")
return performance_data
...
@@ -66,15 +66,22 @@ python -m mooncake.http_metadata_server
**Launch Mooncake `master service`:**
```bash
mooncake_master --eviction_high_watermark_ratio=0.95
```
To start both the metadata and master services together:
```bash
mooncake_master --enable_http_metadata_server=true --eviction_high_watermark_ratio=0.95
```
**Understanding `eviction_high_watermark_ratio`:**
When a `PutStart` request fails due to insufficient memory, or when the eviction thread detects that space usage has reached the configured high watermark ratio, an eviction task is triggered to free up space by evicting a portion of objects.
Due to memory fragmentation, allocation failures may occur even when memory usage has not yet reached 100%. The actual threshold depends on the workload. This [benchmark document](https://kvcache-ai.github.io/Mooncake/performance/allocator_benchmark_result.html)
provides memory allocation efficiency results under different scenarios. If excessive allocation failures are observed, consider lowering this parameter accordingly.
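For example, a workload that fragments memory heavily might call for a lower watermark so eviction starts earlier; the value below is purely illustrative, not a recommendation from this document:

```bash
# Start eviction once usage reaches 85% instead of 95%, leaving headroom
# for fragmented allocations (0.85 is an illustrative value; tune per workload).
mooncake_master --enable_http_metadata_server=true --eviction_high_watermark_ratio=0.85
```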
**Launch Mooncake `store service` (Optional):**
First, create and save a configuration file in JSON format. For example:
@@ -106,9 +113,10 @@ Then start the `store service`:
python -m mooncake.mooncake_store_service --config=[config_path]
```
Note: If `MOONCAKE_GLOBAL_SEGMENT_SIZE` is set to a non-zero value when starting the `SGLang server`, launching the `store service` can be skipped. In this case, the `SGLang server` also takes on the role of the `store service`, which simplifies deployment but couples the two components together. Users can choose the deployment approach that best fits their needs.
**Start the `SGLang server` with Mooncake enabled:**
Mooncake configuration can be provided via environment variables. Note that, for optimal performance, the Mooncake backend currently supports only the `page_first` layout (which optimizes memory access patterns for KV cache operations).
There are two ways to configure Mooncake: 1. Using environment variables; 2. Using extra-config of sglang arguments.
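As a minimal sketch of the quick-start path described above, assuming the standard `sglang.launch_server` entry point and a placeholder model path (the hierarchical-cache and backend-selection flags shown are assumptions and may differ between SGLang versions; consult the full guide):

```bash
# Reserve 16 GiB of Mooncake global segment memory so the SGLang server
# also acts as the store service (no separate store service needed).
# --enable-hierarchical-cache and --hicache-storage-backend are assumed flag names.
MOONCAKE_GLOBAL_SEGMENT_SIZE=17179869184 \
python -m sglang.launch_server \
    --model-path <your-model> \
    --enable-hierarchical-cache \
    --hicache-storage-backend mooncake
```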
...