"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "960dc8967e51e9b0e9eecf64cacd6d5cace2f43e"
Unverified commit cf97c0dc, authored by Hongkuan Zhou, committed by GitHub
Browse files

fix: DSR1 DEP Prefill Profiling Benchmark (#4367)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 24af5a33
......@@ -457,6 +457,7 @@ async def run_profile(args):
model_name,
base_url=base_url,
num_gpus=num_gpus,
attention_dp_size=mapping.get_attn_dp_size(),
)
if itl is not None and thpt_per_gpu is not None:
......
......@@ -20,6 +20,12 @@ import random
import subprocess
from typing import Optional, Tuple
from benchmarks.profiler.utils.defaults import (
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO,
AIPERF_PREFILL_BENCHMARK_OSL,
AIPERF_WARMUP_REQUEST_PER_DP_RANK,
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
......@@ -37,7 +43,7 @@ def _get_common_aiperf_cmd(
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000",
warmup_request_count: int = 3,
warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
):
return [
"aiperf",
......@@ -74,11 +80,11 @@ def get_prefill_aiperf_cmd(
seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
osl=5,
osl=AIPERF_PREFILL_BENCHMARK_OSL,
base_url="http://localhost:8000",
concurrency: int = 1,
request_count: int = 1,
warmup_request_count: int = 3,
warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
):
return _get_common_aiperf_cmd(
artifact_dir,
......@@ -116,6 +122,7 @@ def get_decode_aiperf_cmd(
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000",
warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
):
return _get_common_aiperf_cmd(
artifact_dir,
......@@ -123,6 +130,7 @@ def get_decode_aiperf_cmd(
model,
tokenizer,
base_url,
warmup_request_count=warmup_request_count,
) + [
"--synthetic-input-tokens-mean",
str(isl),
......@@ -207,7 +215,7 @@ def get_prefill_ttft(
tokenizer: str,
base_url: str = "http://localhost:8000",
attention_dp_size: int = 1,
attn_dp_num_req_ratio: int = 4,
attn_dp_num_req_ratio: int = AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO,
) -> Optional[float]:
"""
Run prefill benchmark and extract TTFT (ms). Returns None on failure.
......@@ -218,6 +226,7 @@ def get_prefill_ttft(
"""
# DEP-aware measurement (waves of size attention_dp_size)
if attention_dp_size > 1:
assert attn_dp_num_req_ratio > 0, "attn_dp_num_req_ratio must be greater than 0"
total_concurrency = attention_dp_size * attn_dp_num_req_ratio
logger.info(
f"DEP prefill measurement: isl={isl}, attn_dp={attention_dp_size}, attn_dp_num_req_ratio={attn_dp_num_req_ratio}, "
......@@ -232,9 +241,16 @@ def get_prefill_ttft(
base_url=base_url,
concurrency=total_concurrency,
request_count=total_concurrency,
warmup_request_count=AIPERF_WARMUP_REQUEST_PER_DP_RANK * attention_dp_size,
)
try:
max_ttft = float(aiperf_result["time_to_first_token"]["max"])
# subtract the decoding time in-between prefill runs
max_ttft -= (
float(aiperf_result["inter_token_latency"]["avg"])
* (AIPERF_PREFILL_BENCHMARK_OSL - 1)
* (attn_dp_num_req_ratio - 1)
)
return max_ttft / float(attn_dp_num_req_ratio)
except (KeyError, TypeError, ValueError):
logger.warning(
......@@ -266,6 +282,7 @@ def get_decode_itl_and_thpt_per_gpu(
tokenizer: str,
base_url: str = "http://localhost:8000",
num_gpus: int = 1,
attention_dp_size: int = 1,
) -> Tuple[Optional[float], Optional[float]]:
"""
Run decode benchmark and extract (ITL ms, throughput per GPU).
......@@ -279,6 +296,7 @@ def get_decode_itl_and_thpt_per_gpu(
model_name,
tokenizer,
base_url=base_url,
warmup_request_count=AIPERF_WARMUP_REQUEST_PER_DP_RANK * attention_dp_size,
)
if aiperf_result is None:
return None, None
......@@ -300,6 +318,7 @@ def benchmark_decode(
model_name,
tokenizer,
base_url="http://localhost:8000",
warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
):
logger.info(f"Profiling decode with num_request {num_request}...")
......@@ -316,6 +335,7 @@ def benchmark_decode(
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
warmup_request_count=warmup_request_count,
)
aiperf_process = subprocess.Popen(
aiperf_cmd,
......
......@@ -226,6 +226,7 @@ def apply_parallel_mapping_to_config(
cfg = config_modifier.set_prefill_config(
cfg,
max_batch_size=mapping.get_attn_dp_size(),
max_num_tokens=PREFILL_MAX_NUM_TOKENS,
# max num tokens is shared by all attention dp ranks
max_num_tokens=PREFILL_MAX_NUM_TOKENS * mapping.get_attn_dp_size(),
)
return cfg
......@@ -376,5 +376,7 @@ class SGLangConfigModifier:
# Cap total tokens processed in a batch to avoid chunked prefill
args = set_argument_value(args, "--chunked-prefill-size", str(max_num_tokens))
args = append_argument(args, "--enable-dp-lm-head")
worker_service.extraPodSpec.mainContainer.args = args
return cfg.model_dump()
......@@ -25,6 +25,11 @@ DECODE_MAX_CONCURRENCY = 2000
# Cap prefill at 32768 tokens: large enough to avoid chunked prefill, but not
# so large that the activation tensors become too large.
PREFILL_MAX_NUM_TOKENS = 32768
# AIPerf benchmarking related defaults.
# Warmup requests per attention-DP rank; the benchmark callers multiply this
# by the attention DP size to get the total warmup request count.
AIPERF_WARMUP_REQUEST_PER_DP_RANK = 3
# Default `osl` passed to the prefill benchmark command
# (presumably the output sequence length in tokens -- confirm).
AIPERF_PREFILL_BENCHMARK_OSL = 5
# Default number of requests per attention-DP rank in the DEP prefill
# measurement; total concurrency is the attention DP size times this ratio.
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4
class EngineType(str, Enum):
PREFILL = "prefill"
......
......@@ -123,6 +123,7 @@ def profile_decode(
tokenizer,
base_url=url,
num_gpus=num_gpus,
attention_dp_size=attention_dp_size,
)
return _profile_decode_helper(
......
......@@ -90,7 +90,6 @@ def profile_prefill(
max_context_length,
interpolation_granularity,
attention_dp_size: int = 1,
attn_dp_num_req_ratio: int = 4,
):
def get_ttft(isl):
ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}"
......@@ -101,7 +100,6 @@ def profile_prefill(
tokenizer,
base_url=url,
attention_dp_size=attention_dp_size,
attn_dp_num_req_ratio=attn_dp_num_req_ratio,
)
return _profile_prefill_helper(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment