fix(benchmarks): use --conversation-num for grouped single_turn sweeps (DIS-1807) (#8458)

Signed-off-by: Qi Wang <qiwa@nvidia.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

fix(benchmarks): use --conversation-num for grouped single_turn sweeps (DIS-1807) (#8458)
Signed-off-by: Qi Wang <qiwa@nvidia.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
f222e555 · Qi Wang · GitHub · f12c8605 · f222e555 · f222e555
Unverified Commit f222e555 authored Apr 23, 2026 by Qi Wang Committed by GitHub Apr 23, 2026
6 changed files
--- a/benchmarks/multimodal/sweep/README.md
+++ b/benchmarks/multimodal/sweep/README.md
@@ -29,7 +29,9 @@ python -m benchmarks.multimodal.sweep \
 model: Qwen/Qwen3-VL-30B-A3B-Instruct-FP8
 concurrencies: [16, 32, 64, 128, 256]
 osl: 150                    # output sequence length
-request_count: 1000         # requests per concurrency level
+conversation_num: 10        # sessions per sweep value (optional; derived from
+                            # input JSONL's unique session_id count if unset;
+                            # flat JSONLs count each row as a 1-turn conversation)
 warmup_count: 5
 port: 8000
 timeout: 900                # seconds to wait for server readiness
@@ -64,10 +66,21 @@ python -m benchmarks.multimodal.sweep \
  --config experiments/embedding_cache/vllm_serve.yaml \
  --concurrencies 1,2,4 \
  --osl 200 \
-  --request-count 50 \
+  --conversation-num 10 \
  --skip-plots
 ```

+## Warmup semantics
+
+`warmup_count: N` is a **request (turn) budget**, not a session budget. For a
+JSONL with 10 sessions of 10 turns each and `warmup_count: 2`, warmup issues 2
+requests in total — both go to `user_0` (turns 0 and 1) because aiperf's
+continuation-turn priority keeps feeding the in-flight session until the
+warmup budget runs out. Warmup does NOT consume 2 full sessions (20 requests).
+Profiling then starts at `user_1`, runs `user_1..user_9` to completion, and
+wraps to a fresh `user_0` instance for the 10th session. Keep `warmup_count`
+small (≤ turns-per-session) so warmup stays within a single session's prefix.
+
 ## Output Directory Structure

 Given the config above with two input files and two configs (`cache-off`,

--- a/benchmarks/multimodal/sweep/args.py
+++ b/benchmarks/multimodal/sweep/args.py
@@ -59,10 +59,14 @@ def parse_args(argv=None) -> argparse.Namespace:
        help="Override output sequence length.",
    )
    parser.add_argument(
-        "--request-count",
+        "--conversation-num",
        type=int,
        default=None,
-        help="Override request count per sweep value.",
+        help=(
+            "Override number of conversations (sessions) per sweep value. "
+            "If unset, derived from the input JSONL's unique session_id count "
+            "(flat JSONLs count each row as a 1-turn conversation)."
+        ),
    )
    parser.add_argument(
        "--skip-plots",

--- a/benchmarks/multimodal/sweep/config.py
+++ b/benchmarks/multimodal/sweep/config.py
@@ -32,7 +32,7 @@ class SweepConfig:
    request_rates: Optional[List[int]] = None
    concurrencies: Optional[List[int]] = None
    osl: int = 150
-    request_count: int = 1000
+    conversation_num: Optional[int] = None
    warmup_count: int = 5
    port: int = 8000
    timeout: int = 600
@@ -127,7 +127,7 @@ def load_config(
        request_rates=yaml_request_rates,
        concurrencies=yaml_concurrencies,
        osl=raw.get("osl", 150),
-        request_count=raw.get("request_count", 1000),
+        conversation_num=raw.get("conversation_num"),
        warmup_count=raw.get("warmup_count", 5),
        port=raw.get("port", 8000),
        timeout=raw.get("timeout", 600),

--- a/benchmarks/multimodal/sweep/dataset_shape.py
+++ b/benchmarks/multimodal/sweep/dataset_shape.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+
+def count_session_ids(jsonl_path: str | Path) -> int:
+    """Count the conversations (sessions) in a JSONL dataset.
+
+    Used to derive ``conversation_num`` for the sweep when the user hasn't set
+    it explicitly. Each distinct ``session_id`` (compared after ``str()``
+    conversion) counts once; every row whose ``session_id`` is missing or
+    null counts as its own session (matches aiperf's per-row UUID fallback in
+    ``SingleTurnDatasetLoader``).
+
+    For multi_turn rows (``{"type": "multi_turn", "session_id": ..., "turns": [...]}``),
+    the top-level ``session_id`` is what counts.
+
+    Args:
+        jsonl_path: Path to the dataset, one JSON object per line. Blank
+            lines are ignored.
+
+    Returns:
+        Number of unique session ids plus the number of rows without one.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If a non-blank line is not valid JSON. The
+            message is prefixed with the file path and line number.
+        ValueError: If a line parses to something other than a JSON object.
+    """
+    sessions: set[str] = set()
+    anon_count = 0
+    # JSON text is UTF-8; do not depend on the platform's locale encoding.
+    with open(jsonl_path, encoding="utf-8") as f:
+        for lineno, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError as err:
+                # Same exception type as before, but say which row is broken.
+                raise json.JSONDecodeError(
+                    f"{jsonl_path}, line {lineno}: {err.msg}", err.doc, err.pos
+                ) from err
+            if not isinstance(row, dict):
+                # Previously surfaced as an opaque AttributeError on ``.get``.
+                raise ValueError(
+                    f"{jsonl_path}, line {lineno}: expected a JSON object, "
+                    f"got {type(row).__name__}"
+                )
+            sid = row.get("session_id")
+            if sid is None:
+                anon_count += 1
+            else:
+                sessions.add(str(sid))
+    return len(sessions) + anon_count
--- a/benchmarks/multimodal/sweep/orchestrator.py
+++ b/benchmarks/multimodal/sweep/orchestrator.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from typing import List, Optional

 from .config import BenchmarkConfig, SweepConfig, input_file_tag, resolve_repo_root
+from .dataset_shape import count_session_ids
 from .runner import run_aiperf_single
 from .server import ServerManager

@@ -18,6 +19,26 @@ def _resolve_workflow(workflow: str, repo_root: Path) -> str:
    return str(repo_root / p)


+def _resolve_conversation_num(config: SweepConfig, input_file: str) -> int:
+    """Return the conversation count to use for ``input_file``.
+
+    The dataset's session count is always measured via ``count_session_ids``.
+    When ``config.conversation_num`` is unset, that measured count is printed
+    and returned. When it is set, it is returned as-is unless it is larger
+    than the measured count, in which case ``ValueError`` is raised (the
+    sampler would otherwise wrap around the dataset).
+    """
+    available = count_session_ids(input_file)
+    requested = config.conversation_num
+
+    if requested is not None:
+        # Explicit setting wins, provided the dataset can actually supply it.
+        if requested <= available:
+            return requested
+        raise ValueError(
+            f"conversation_num={requested} exceeds unique "
+            f"session_id count ({available}) in {input_file}. SequentialSampler "
+            f"would wrap. Set conversation_num <= {available} or reshape the JSONL."
+        )
+
+    # Nothing configured: fall back to what the dataset contains.
+    print(
+        f"  conversation_num derived from {input_file}: {available}",
+        flush=True,
+    )
+    return available
+
+
 def _print_banner(title: str, char: str = "=", width: int = 70) -> None:
    print(f"\n{char * width}")
    print(f"  {title}")
@@ -46,7 +67,8 @@ def run_sweep(
    print(f"  Sweep mode:    {sweep_mode}")
    print(f"  Sweep values:  {sweep_values}")
    print(f"  OSL:           {config.osl}")
-    print(f"  Requests:      {config.request_count} per {sweep_mode}")
+    if config.conversation_num is not None:
+        print(f"  Conversations: {config.conversation_num} per {sweep_mode}")
    print(
        f"  Restart:       {'every run' if config.restart_server_every_benchmark else 'per config'}"
    )
@@ -98,11 +120,13 @@ def _run_config(
    _print_banner(f"Config: {bench_cfg.label}", char="#")

    # Collect pending runs, skipping those with existing results.
-    pending_runs: List[tuple[str, str, int, Path]] = []
+    pending_runs: List[tuple[str, str, int, Path, int]] = []
    for input_file in config.input_files:
        file_tag = input_file_tag(input_file)
        sweep_dir = output_base / file_tag / bench_cfg.label

+        conversation_num = _resolve_conversation_num(config, input_file)
+
        for value in sorted(sweep_values):
            artifact_dir = sweep_dir / f"{sweep_mode}{value}"

@@ -113,7 +137,9 @@ def _run_config(
                    flush=True,
                )
            else:
-                pending_runs.append((input_file, file_tag, value, artifact_dir))
+                pending_runs.append(
+                    (input_file, file_tag, value, artifact_dir, conversation_num)
+                )

    if not pending_runs:
        print(f"  All runs skipped for {bench_cfg.label}", flush=True)
@@ -128,7 +154,7 @@ def _run_config(
        )

    try:
-        for input_file, file_tag, value, artifact_dir in pending_runs:
+        for input_file, file_tag, value, artifact_dir, conversation_num in pending_runs:
            _print_banner(
                f"[{file_tag}] Config: {bench_cfg.label}  " f"{sweep_mode}={value}",
                char="-",
@@ -148,7 +174,7 @@ def _run_config(
                    port=config.port,
                    sweep_mode=sweep_mode,
                    sweep_value=value,
-                    request_count=config.request_count,
+                    conversation_num=conversation_num,
                    warmup_count=config.warmup_count,
                    input_file=input_file,
                    osl=config.osl,

--- a/benchmarks/multimodal/sweep/runner.py
+++ b/benchmarks/multimodal/sweep/runner.py
@@ -13,7 +13,7 @@ def _build_aiperf_cmd(
    port: int,
    sweep_mode: str,
    sweep_value: int,
-    request_count: int,
+    conversation_num: int,
    warmup_count: int,
    input_file: str,
    osl: int,
@@ -33,8 +33,8 @@ def _build_aiperf_cmd(
        f"http://localhost:{port}",
        sweep_flag,
        str(sweep_value),
-        "--request-count",
-        str(request_count),
+        "--conversation-num",
+        str(conversation_num),
        "--warmup-request-count",
        str(warmup_count),
        "--input-file",
@@ -63,7 +63,7 @@ def run_aiperf_single(
    port: int,
    sweep_mode: str,
    sweep_value: int,
-    request_count: int,
+    conversation_num: int,
    warmup_count: int,
    input_file: str,
    osl: int,
@@ -76,7 +76,7 @@ def run_aiperf_single(
        port=port,
        sweep_mode=sweep_mode,
        sweep_value=sweep_value,
-        request_count=request_count,
+        conversation_num=conversation_num,
        warmup_count=warmup_count,
        input_file=input_file,
        osl=osl,
@@ -104,7 +104,7 @@ def run_sweep(
    port: int,
    sweep_mode: str,
    sweep_values: List[int],
-    request_count: int,
+    conversation_num: int,
    warmup_count: int,
    input_file: str,
    osl: int,
@@ -119,7 +119,7 @@ def run_sweep(
            port=port,
            sweep_mode=sweep_mode,
            sweep_value=value,
-            request_count=request_count,
+            conversation_num=conversation_num,
            warmup_count=warmup_count,
            input_file=input_file,
            osl=osl,