fix: DSR1 DEP Prefill Profiling Benchmark (#4367)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

fix: DSR1 DEP Prefill Profiling Benchmark (#4367)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
cf97c0dc · Hongkuan Zhou · GitHub · 24af5a33 · cf97c0dc · cf97c0dc
Unverified commit cf97c0dc, authored Nov 17, 2025 by Hongkuan Zhou; committed by GitHub on Nov 17, 2025
7 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -457,6 +457,7 @@ async def run_profile(args):
                                model_name,
                                base_url=base_url,
                                num_gpus=num_gpus,
+                                attention_dp_size=mapping.get_attn_dp_size(),
                            )
                        if itl is not None and thpt_per_gpu is not None:

--- a/benchmarks/profiler/utils/aiperf.py
+++ b/benchmarks/profiler/utils/aiperf.py
@@ -20,6 +20,12 @@ import random
 import subprocess
 from typing import Optional, Tuple
+from benchmarks.profiler.utils.defaults import (
+    AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO,
+    AIPERF_PREFILL_BENCHMARK_OSL,
+    AIPERF_WARMUP_REQUEST_PER_DP_RANK,
+)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 console_handler = logging.StreamHandler()
@@ -37,7 +43,7 @@ def _get_common_aiperf_cmd(
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    base_url="http://localhost:8000",
-    warmup_request_count: int = 3,
+    warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
 ):
    return [
        "aiperf",
@@ -74,11 +80,11 @@ def get_prefill_aiperf_cmd(
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-    osl=5,
+    osl=AIPERF_PREFILL_BENCHMARK_OSL,
    base_url="http://localhost:8000",
    concurrency: int = 1,
    request_count: int = 1,
-    warmup_request_count: int = 3,
+    warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
 ):
    return _get_common_aiperf_cmd(
        artifact_dir,
@@ -116,6 +122,7 @@ def get_decode_aiperf_cmd(
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    base_url="http://localhost:8000",
+    warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
 ):
    return _get_common_aiperf_cmd(
        artifact_dir,
@@ -123,6 +130,7 @@ def get_decode_aiperf_cmd(
        model,
        tokenizer,
        base_url,
+        warmup_request_count=warmup_request_count,
    ) + [
        "--synthetic-input-tokens-mean",
        str(isl),
@@ -207,7 +215,7 @@ def get_prefill_ttft(
    tokenizer: str,
    base_url: str = "http://localhost:8000",
    attention_dp_size: int = 1,
-    attn_dp_num_req_ratio: int = 4,
+    attn_dp_num_req_ratio: int = AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO,
 ) -> Optional[float]:
    """
    Run prefill benchmark and extract TTFT (ms). Returns None on failure.
@@ -218,6 +226,7 @@ def get_prefill_ttft(
    """
    # DEP-aware measurement (waves of size attention_dp_size)
    if attention_dp_size > 1:
+        assert attn_dp_num_req_ratio > 0, "attn_dp_num_req_ratio must be greater than 0"
        total_concurrency = attention_dp_size * attn_dp_num_req_ratio
        logger.info(
            f"DEP prefill measurement: isl={isl}, attn_dp={attention_dp_size}, attn_dp_num_req_ratio={attn_dp_num_req_ratio}, "
@@ -232,9 +241,16 @@ def get_prefill_ttft(
            base_url=base_url,
            concurrency=total_concurrency,
            request_count=total_concurrency,
+            warmup_request_count=AIPERF_WARMUP_REQUEST_PER_DP_RANK * attention_dp_size,
        )
        try:
            max_ttft = float(aiperf_result["time_to_first_token"]["max"])
+            # subtract the decoding time in-between prefill runs
+            max_ttft -= (
+                float(aiperf_result["inter_token_latency"]["avg"])
+                * (AIPERF_PREFILL_BENCHMARK_OSL - 1)
+                * (attn_dp_num_req_ratio - 1)
+            )
            return max_ttft / float(attn_dp_num_req_ratio)
        except (KeyError, TypeError, ValueError):
            logger.warning(
@@ -266,6 +282,7 @@ def get_decode_itl_and_thpt_per_gpu(
    tokenizer: str,
    base_url: str = "http://localhost:8000",
    num_gpus: int = 1,
+    attention_dp_size: int = 1,
 ) -> Tuple[Optional[float], Optional[float]]:
    """
    Run decode benchmark and extract (ITL ms, throughput per GPU).
@@ -279,6 +296,7 @@ def get_decode_itl_and_thpt_per_gpu(
        model_name,
        tokenizer,
        base_url=base_url,
+        warmup_request_count=AIPERF_WARMUP_REQUEST_PER_DP_RANK * attention_dp_size,
    )
    if aiperf_result is None:
        return None, None
@@ -300,6 +318,7 @@ def benchmark_decode(
    model_name,
    tokenizer,
    base_url="http://localhost:8000",
+    warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
 ):
    logger.info(f"Profiling decode with num_request {num_request}...")
@@ -316,6 +335,7 @@ def benchmark_decode(
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
+        warmup_request_count=warmup_request_count,
    )
    aiperf_process = subprocess.Popen(
        aiperf_cmd,

--- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
+++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
@@ -226,6 +226,7 @@ def apply_parallel_mapping_to_config(
        cfg = config_modifier.set_prefill_config(
            cfg,
            max_batch_size=mapping.get_attn_dp_size(),
-            max_num_tokens=PREFILL_MAX_NUM_TOKENS,
+            # max num tokens is shared by all attention dp ranks
+            max_num_tokens=PREFILL_MAX_NUM_TOKENS * mapping.get_attn_dp_size(),
        )
    return cfg
--- a/benchmarks/profiler/utils/config_modifiers/sglang.py
+++ b/benchmarks/profiler/utils/config_modifiers/sglang.py
@@ -376,5 +376,7 @@ class SGLangConfigModifier:
        # Cap total tokens processed in a batch to avoid chunked prefill
        args = set_argument_value(args, "--chunked-prefill-size", str(max_num_tokens))
+        args = append_argument(args, "--enable-dp-lm-head")
        worker_service.extraPodSpec.mainContainer.args = args
        return cfg.model_dump()
--- a/benchmarks/profiler/utils/defaults.py
+++ b/benchmarks/profiler/utils/defaults.py
@@ -25,6 +25,11 @@ DECODE_MAX_CONCURRENCY = 2000
 # set a prefill maximum number of tokens to 32768 to avoid chunked prefill but not too large to cause activation tensor too large
 PREFILL_MAX_NUM_TOKENS = 32768
+# AIPerf benchmarking related defaults
+AIPERF_WARMUP_REQUEST_PER_DP_RANK = 3
+AIPERF_PREFILL_BENCHMARK_OSL = 5
+AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4
 class EngineType(str, Enum):
    PREFILL = "prefill"

--- a/benchmarks/profiler/utils/profile_decode.py
+++ b/benchmarks/profiler/utils/profile_decode.py
@@ -123,6 +123,7 @@ def profile_decode(
            tokenizer,
            base_url=url,
            num_gpus=num_gpus,
+            attention_dp_size=attention_dp_size,
        )
    return _profile_decode_helper(

--- a/benchmarks/profiler/utils/profile_prefill.py
+++ b/benchmarks/profiler/utils/profile_prefill.py
@@ -90,7 +90,6 @@ def profile_prefill(
    max_context_length,
    interpolation_granularity,
    attention_dp_size: int = 1,
-    attn_dp_num_req_ratio: int = 4,
 ):
    def get_ttft(isl):
        ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}"
@@ -101,7 +100,6 @@ def profile_prefill(
            tokenizer,
            base_url=url,
            attention_dp_size=attention_dp_size,
-            attn_dp_num_req_ratio=attn_dp_num_req_ratio,
        )
    return _profile_prefill_helper(