Unverified commit cf97c0dc, authored by Hongkuan Zhou and committed by GitHub
Browse files

fix: DSR1 DEP Prefill Profiling Benchmark (#4367)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 24af5a33
...@@ -457,6 +457,7 @@ async def run_profile(args): ...@@ -457,6 +457,7 @@ async def run_profile(args):
model_name, model_name,
base_url=base_url, base_url=base_url,
num_gpus=num_gpus, num_gpus=num_gpus,
attention_dp_size=mapping.get_attn_dp_size(),
) )
if itl is not None and thpt_per_gpu is not None: if itl is not None and thpt_per_gpu is not None:
......
...@@ -20,6 +20,12 @@ import random ...@@ -20,6 +20,12 @@ import random
import subprocess import subprocess
from typing import Optional, Tuple from typing import Optional, Tuple
from benchmarks.profiler.utils.defaults import (
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO,
AIPERF_PREFILL_BENCHMARK_OSL,
AIPERF_WARMUP_REQUEST_PER_DP_RANK,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
...@@ -37,7 +43,7 @@ def _get_common_aiperf_cmd( ...@@ -37,7 +43,7 @@ def _get_common_aiperf_cmd(
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000", base_url="http://localhost:8000",
warmup_request_count: int = 3, warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
): ):
return [ return [
"aiperf", "aiperf",
...@@ -74,11 +80,11 @@ def get_prefill_aiperf_cmd( ...@@ -74,11 +80,11 @@ def get_prefill_aiperf_cmd(
seed=100, seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
osl=5, osl=AIPERF_PREFILL_BENCHMARK_OSL,
base_url="http://localhost:8000", base_url="http://localhost:8000",
concurrency: int = 1, concurrency: int = 1,
request_count: int = 1, request_count: int = 1,
warmup_request_count: int = 3, warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
): ):
return _get_common_aiperf_cmd( return _get_common_aiperf_cmd(
artifact_dir, artifact_dir,
...@@ -116,6 +122,7 @@ def get_decode_aiperf_cmd( ...@@ -116,6 +122,7 @@ def get_decode_aiperf_cmd(
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000", base_url="http://localhost:8000",
warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
): ):
return _get_common_aiperf_cmd( return _get_common_aiperf_cmd(
artifact_dir, artifact_dir,
...@@ -123,6 +130,7 @@ def get_decode_aiperf_cmd( ...@@ -123,6 +130,7 @@ def get_decode_aiperf_cmd(
model, model,
tokenizer, tokenizer,
base_url, base_url,
warmup_request_count=warmup_request_count,
) + [ ) + [
"--synthetic-input-tokens-mean", "--synthetic-input-tokens-mean",
str(isl), str(isl),
...@@ -207,7 +215,7 @@ def get_prefill_ttft( ...@@ -207,7 +215,7 @@ def get_prefill_ttft(
tokenizer: str, tokenizer: str,
base_url: str = "http://localhost:8000", base_url: str = "http://localhost:8000",
attention_dp_size: int = 1, attention_dp_size: int = 1,
attn_dp_num_req_ratio: int = 4, attn_dp_num_req_ratio: int = AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO,
) -> Optional[float]: ) -> Optional[float]:
""" """
Run prefill benchmark and extract TTFT (ms). Returns None on failure. Run prefill benchmark and extract TTFT (ms). Returns None on failure.
...@@ -218,6 +226,7 @@ def get_prefill_ttft( ...@@ -218,6 +226,7 @@ def get_prefill_ttft(
""" """
# DEP-aware measurement (waves of size attention_dp_size) # DEP-aware measurement (waves of size attention_dp_size)
if attention_dp_size > 1: if attention_dp_size > 1:
assert attn_dp_num_req_ratio > 0, "attn_dp_num_req_ratio must be greater than 0"
total_concurrency = attention_dp_size * attn_dp_num_req_ratio total_concurrency = attention_dp_size * attn_dp_num_req_ratio
logger.info( logger.info(
f"DEP prefill measurement: isl={isl}, attn_dp={attention_dp_size}, attn_dp_num_req_ratio={attn_dp_num_req_ratio}, " f"DEP prefill measurement: isl={isl}, attn_dp={attention_dp_size}, attn_dp_num_req_ratio={attn_dp_num_req_ratio}, "
...@@ -232,9 +241,16 @@ def get_prefill_ttft( ...@@ -232,9 +241,16 @@ def get_prefill_ttft(
base_url=base_url, base_url=base_url,
concurrency=total_concurrency, concurrency=total_concurrency,
request_count=total_concurrency, request_count=total_concurrency,
warmup_request_count=AIPERF_WARMUP_REQUEST_PER_DP_RANK * attention_dp_size,
) )
try: try:
max_ttft = float(aiperf_result["time_to_first_token"]["max"]) max_ttft = float(aiperf_result["time_to_first_token"]["max"])
# subtract the decoding time in-between prefill runs
max_ttft -= (
float(aiperf_result["inter_token_latency"]["avg"])
* (AIPERF_PREFILL_BENCHMARK_OSL - 1)
* (attn_dp_num_req_ratio - 1)
)
return max_ttft / float(attn_dp_num_req_ratio) return max_ttft / float(attn_dp_num_req_ratio)
except (KeyError, TypeError, ValueError): except (KeyError, TypeError, ValueError):
logger.warning( logger.warning(
...@@ -266,6 +282,7 @@ def get_decode_itl_and_thpt_per_gpu( ...@@ -266,6 +282,7 @@ def get_decode_itl_and_thpt_per_gpu(
tokenizer: str, tokenizer: str,
base_url: str = "http://localhost:8000", base_url: str = "http://localhost:8000",
num_gpus: int = 1, num_gpus: int = 1,
attention_dp_size: int = 1,
) -> Tuple[Optional[float], Optional[float]]: ) -> Tuple[Optional[float], Optional[float]]:
""" """
Run decode benchmark and extract (ITL ms, throughput per GPU). Run decode benchmark and extract (ITL ms, throughput per GPU).
...@@ -279,6 +296,7 @@ def get_decode_itl_and_thpt_per_gpu( ...@@ -279,6 +296,7 @@ def get_decode_itl_and_thpt_per_gpu(
model_name, model_name,
tokenizer, tokenizer,
base_url=base_url, base_url=base_url,
warmup_request_count=AIPERF_WARMUP_REQUEST_PER_DP_RANK * attention_dp_size,
) )
if aiperf_result is None: if aiperf_result is None:
return None, None return None, None
...@@ -300,6 +318,7 @@ def benchmark_decode( ...@@ -300,6 +318,7 @@ def benchmark_decode(
model_name, model_name,
tokenizer, tokenizer,
base_url="http://localhost:8000", base_url="http://localhost:8000",
warmup_request_count: int = AIPERF_WARMUP_REQUEST_PER_DP_RANK,
): ):
logger.info(f"Profiling decode with num_request {num_request}...") logger.info(f"Profiling decode with num_request {num_request}...")
...@@ -316,6 +335,7 @@ def benchmark_decode( ...@@ -316,6 +335,7 @@ def benchmark_decode(
model=model_name, model=model_name,
tokenizer=tokenizer, tokenizer=tokenizer,
base_url=base_url, base_url=base_url,
warmup_request_count=warmup_request_count,
) )
aiperf_process = subprocess.Popen( aiperf_process = subprocess.Popen(
aiperf_cmd, aiperf_cmd,
......
...@@ -226,6 +226,7 @@ def apply_parallel_mapping_to_config( ...@@ -226,6 +226,7 @@ def apply_parallel_mapping_to_config(
cfg = config_modifier.set_prefill_config( cfg = config_modifier.set_prefill_config(
cfg, cfg,
max_batch_size=mapping.get_attn_dp_size(), max_batch_size=mapping.get_attn_dp_size(),
max_num_tokens=PREFILL_MAX_NUM_TOKENS, # max num tokens is shared by all attention dp ranks
max_num_tokens=PREFILL_MAX_NUM_TOKENS * mapping.get_attn_dp_size(),
) )
return cfg return cfg
...@@ -376,5 +376,7 @@ class SGLangConfigModifier: ...@@ -376,5 +376,7 @@ class SGLangConfigModifier:
# Cap total tokens processed in a batch to avoid chunked prefill # Cap total tokens processed in a batch to avoid chunked prefill
args = set_argument_value(args, "--chunked-prefill-size", str(max_num_tokens)) args = set_argument_value(args, "--chunked-prefill-size", str(max_num_tokens))
args = append_argument(args, "--enable-dp-lm-head")
worker_service.extraPodSpec.mainContainer.args = args worker_service.extraPodSpec.mainContainer.args = args
return cfg.model_dump() return cfg.model_dump()
...@@ -25,6 +25,11 @@ DECODE_MAX_CONCURRENCY = 2000 ...@@ -25,6 +25,11 @@ DECODE_MAX_CONCURRENCY = 2000
# set a prefill maximum number of tokens to 32768 to avoid chunked prefill but not too large to cause activation tensor too large # set a prefill maximum number of tokens to 32768 to avoid chunked prefill but not too large to cause activation tensor too large
PREFILL_MAX_NUM_TOKENS = 32768 PREFILL_MAX_NUM_TOKENS = 32768
# AIPerf benchmarking related defaults
AIPERF_WARMUP_REQUEST_PER_DP_RANK = 3
AIPERF_PREFILL_BENCHMARK_OSL = 5
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4
class EngineType(str, Enum): class EngineType(str, Enum):
PREFILL = "prefill" PREFILL = "prefill"
......
...@@ -123,6 +123,7 @@ def profile_decode( ...@@ -123,6 +123,7 @@ def profile_decode(
tokenizer, tokenizer,
base_url=url, base_url=url,
num_gpus=num_gpus, num_gpus=num_gpus,
attention_dp_size=attention_dp_size,
) )
return _profile_decode_helper( return _profile_decode_helper(
......
...@@ -90,7 +90,6 @@ def profile_prefill( ...@@ -90,7 +90,6 @@ def profile_prefill(
max_context_length, max_context_length,
interpolation_granularity, interpolation_granularity,
attention_dp_size: int = 1, attention_dp_size: int = 1,
attn_dp_num_req_ratio: int = 4,
): ):
def get_ttft(isl): def get_ttft(isl):
ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}" ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}"
...@@ -101,7 +100,6 @@ def profile_prefill( ...@@ -101,7 +100,6 @@ def profile_prefill(
tokenizer, tokenizer,
base_url=url, base_url=url,
attention_dp_size=attention_dp_size, attention_dp_size=attention_dp_size,
attn_dp_num_req_ratio=attn_dp_num_req_ratio,
) )
return _profile_prefill_helper( return _profile_prefill_helper(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment