Unverified commit 59053354, authored by Jason Zhou, committed by GitHub
Browse files

feat: add --tokenizer_path to profile_endpoint.py (#2550)

parent 344c21dc
...@@ -35,6 +35,13 @@ if __name__ == "__main__": ...@@ -35,6 +35,13 @@ if __name__ == "__main__":
required=True, required=True,
help="model name", help="model name",
) )
parser.add_argument(
"--tokenizer_path",
type=str,
required=False,
default="",
help="tokenizer path",
)
parser.add_argument( parser.add_argument(
"--url", "--url",
type=str, type=str,
...@@ -75,10 +82,13 @@ if __name__ == "__main__": ...@@ -75,10 +82,13 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
os.makedirs(args.work_dir, exist_ok=True) os.makedirs(args.work_dir, exist_ok=True)
if args.tokenizer_path == "":
args.tokenizer_path = args.model_name
if args.mode == "prefill": if args.mode == "prefill":
profile_prefill( profile_prefill(
args.work_dir, args.work_dir,
args.model_name, args.model_name,
args.tokenizer_path,
args.url, args.url,
args.num_gpus, args.num_gpus,
args.max_context_length, args.max_context_length,
...@@ -89,6 +99,7 @@ if __name__ == "__main__": ...@@ -89,6 +99,7 @@ if __name__ == "__main__":
profile_decode( profile_decode(
args.work_dir, args.work_dir,
args.model_name, args.model_name,
args.tokenizer_path,
args.url, args.url,
args.num_gpus, args.num_gpus,
args.max_kv_tokens, args.max_kv_tokens,
......
...@@ -421,6 +421,7 @@ async def run_profile(args): ...@@ -421,6 +421,7 @@ async def run_profile(args):
profile_prefill( profile_prefill(
work_dir, work_dir,
model_name, model_name,
model_name,
base_url, base_url,
best_prefill_tp, best_prefill_tp,
args.max_context_length, args.max_context_length,
...@@ -476,6 +477,7 @@ async def run_profile(args): ...@@ -476,6 +477,7 @@ async def run_profile(args):
profile_decode( profile_decode(
work_dir, work_dir,
model_name, model_name,
model_name,
base_url, base_url,
best_decode_tp, best_decode_tp,
max_kv_tokens, max_kv_tokens,
......
...@@ -34,6 +34,7 @@ def _get_common_genai_perf_cmd( ...@@ -34,6 +34,7 @@ def _get_common_genai_perf_cmd(
artifact_dir, artifact_dir,
seed=100, seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
return [ return [
...@@ -42,7 +43,7 @@ def _get_common_genai_perf_cmd( ...@@ -42,7 +43,7 @@ def _get_common_genai_perf_cmd(
"--model", "--model",
model, model,
"--tokenizer", "--tokenizer",
model, tokenizer,
"--endpoint-type", "--endpoint-type",
"chat", "chat",
"--endpoint", "--endpoint",
...@@ -68,6 +69,7 @@ def get_prefill_genai_perf_cmd( ...@@ -68,6 +69,7 @@ def get_prefill_genai_perf_cmd(
artifact_dir, artifact_dir,
seed=100, seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
osl=5, osl=5,
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
...@@ -75,6 +77,7 @@ def get_prefill_genai_perf_cmd( ...@@ -75,6 +77,7 @@ def get_prefill_genai_perf_cmd(
artifact_dir, artifact_dir,
seed, seed,
model, model,
tokenizer,
base_url, base_url,
) + [ ) + [
"--synthetic-input-tokens-mean", "--synthetic-input-tokens-mean",
...@@ -103,12 +106,14 @@ def get_decode_genai_perf_cmd( ...@@ -103,12 +106,14 @@ def get_decode_genai_perf_cmd(
num_request, num_request,
seed=100, seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
return _get_common_genai_perf_cmd( return _get_common_genai_perf_cmd(
artifact_dir, artifact_dir,
seed, seed,
model, model,
tokenizer,
base_url, base_url,
) + [ ) + [
"--synthetic-input-tokens-mean", "--synthetic-input-tokens-mean",
...@@ -147,11 +152,19 @@ def get_gap_result(artifact_dir: str) -> dict: ...@@ -147,11 +152,19 @@ def get_gap_result(artifact_dir: str) -> dict:
def benchmark_prefill( def benchmark_prefill(
isl, genai_perf_artifact_dir, model_name, base_url="http://localhost:8000" isl,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url="http://localhost:8000",
): ):
logger.info(f"Running genai-perf with isl {isl}") logger.info(f"Running genai-perf with isl {isl}")
genai_perf_cmd = get_prefill_genai_perf_cmd( genai_perf_cmd = get_prefill_genai_perf_cmd(
isl, genai_perf_artifact_dir, model=model_name, base_url=base_url isl,
genai_perf_artifact_dir,
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
) )
print(f"genai-perf cmd: {genai_perf_cmd}") print(f"genai-perf cmd: {genai_perf_cmd}")
# import pdb; pdb.set_trace() # import pdb; pdb.set_trace()
...@@ -179,6 +192,7 @@ def benchmark_decode( ...@@ -179,6 +192,7 @@ def benchmark_decode(
num_request, num_request,
genai_perf_artifact_dir, genai_perf_artifact_dir,
model_name, model_name,
tokenizer,
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
logger.info(f"Profiling decode with num_request {num_request}...") logger.info(f"Profiling decode with num_request {num_request}...")
...@@ -194,6 +208,7 @@ def benchmark_decode( ...@@ -194,6 +208,7 @@ def benchmark_decode(
num_request, num_request,
seed=seed, seed=seed,
model=model_name, model=model_name,
tokenizer=tokenizer,
base_url=base_url, base_url=base_url,
) )
gap_process = subprocess.Popen( gap_process = subprocess.Popen(
...@@ -211,6 +226,7 @@ def benchmark_decode( ...@@ -211,6 +226,7 @@ def benchmark_decode(
num_request, num_request,
seed=seed, seed=seed,
model=model_name, model=model_name,
tokenizer=tokenizer,
base_url=base_url, base_url=base_url,
) )
gap_process = subprocess.Popen( gap_process = subprocess.Popen(
......
...@@ -21,6 +21,7 @@ logger.addHandler(console_handler) ...@@ -21,6 +21,7 @@ logger.addHandler(console_handler)
def profile_decode( def profile_decode(
work_dir, work_dir,
model_name, model_name,
tokenizer,
url, url,
num_gpus, num_gpus,
max_kv_tokens, max_kv_tokens,
...@@ -41,6 +42,13 @@ def profile_decode( ...@@ -41,6 +42,13 @@ def profile_decode(
(max_context_length - osl) // interpolation_granularity, (max_context_length - osl) // interpolation_granularity,
): ):
max_concurrency = max_kv_tokens // (isl + osl) max_concurrency = max_kv_tokens // (isl + osl)
if max_concurrency // interpolation_granularity == 0:
logger.warning(
f"max_concurrency {max_concurrency} is too small for"
f" interpolation granularity {interpolation_granularity}."
f" max_kv_tokens {max_kv_tokens}, isl {isl}, osl {osl}"
)
break
sweep_num_request = range( sweep_num_request = range(
1, 1,
max_concurrency, max_concurrency,
...@@ -54,6 +62,7 @@ def profile_decode( ...@@ -54,6 +62,7 @@ def profile_decode(
num_request, num_request,
genai_perf_artifact_dir, genai_perf_artifact_dir,
model_name, model_name,
tokenizer,
base_url=url, base_url=url,
) )
if gap_result is not None: if gap_result is not None:
......
...@@ -19,7 +19,13 @@ logger.addHandler(console_handler) ...@@ -19,7 +19,13 @@ logger.addHandler(console_handler)
def profile_prefill( def profile_prefill(
work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity work_dir,
model_name,
tokenizer,
url,
num_gpus,
max_context_length,
interpolation_granularity,
): ):
prefill_isl = [] prefill_isl = []
prefill_ttft = [] prefill_ttft = []
...@@ -32,7 +38,11 @@ def profile_prefill( ...@@ -32,7 +38,11 @@ def profile_prefill(
# run genai-perf # run genai-perf
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
gap_result = benchmark_prefill( gap_result = benchmark_prefill(
isl, genai_perf_artifact_dir, model_name, base_url=url isl,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url=url,
) )
if gap_result is not None: if gap_result is not None:
ttft = gap_result["time_to_first_token"]["avg"] ttft = gap_result["time_to_first_token"]["avg"]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment