"lib/vscode:/vscode.git/clone" did not exist on "55c6525f5b40e9dac57a764f9f36a912f4de25cc"
Unverified commit 59053354, authored by Jason Zhou, committed by GitHub
Browse files

feat: add --tokenizer_path to profile_endpoint.py (#2550)

parent 344c21dc
......@@ -35,6 +35,13 @@ if __name__ == "__main__":
required=True,
help="model name",
)
parser.add_argument(
"--tokenizer_path",
type=str,
required=False,
default="",
help="tokenizer path",
)
parser.add_argument(
"--url",
type=str,
......@@ -75,10 +82,13 @@ if __name__ == "__main__":
args = parser.parse_args()
os.makedirs(args.work_dir, exist_ok=True)
if args.tokenizer_path == "":
args.tokenizer_path = args.model_name
if args.mode == "prefill":
profile_prefill(
args.work_dir,
args.model_name,
args.tokenizer_path,
args.url,
args.num_gpus,
args.max_context_length,
......@@ -89,6 +99,7 @@ if __name__ == "__main__":
profile_decode(
args.work_dir,
args.model_name,
args.tokenizer_path,
args.url,
args.num_gpus,
args.max_kv_tokens,
......
......@@ -421,6 +421,7 @@ async def run_profile(args):
profile_prefill(
work_dir,
model_name,
model_name,
base_url,
best_prefill_tp,
args.max_context_length,
......@@ -476,6 +477,7 @@ async def run_profile(args):
profile_decode(
work_dir,
model_name,
model_name,
base_url,
best_decode_tp,
max_kv_tokens,
......
......@@ -34,6 +34,7 @@ def _get_common_genai_perf_cmd(
artifact_dir,
seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000",
):
return [
......@@ -42,7 +43,7 @@ def _get_common_genai_perf_cmd(
"--model",
model,
"--tokenizer",
model,
tokenizer,
"--endpoint-type",
"chat",
"--endpoint",
......@@ -68,6 +69,7 @@ def get_prefill_genai_perf_cmd(
artifact_dir,
seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
osl=5,
base_url="http://localhost:8000",
):
......@@ -75,6 +77,7 @@ def get_prefill_genai_perf_cmd(
artifact_dir,
seed,
model,
tokenizer,
base_url,
) + [
"--synthetic-input-tokens-mean",
......@@ -103,12 +106,14 @@ def get_decode_genai_perf_cmd(
num_request,
seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000",
):
return _get_common_genai_perf_cmd(
artifact_dir,
seed,
model,
tokenizer,
base_url,
) + [
"--synthetic-input-tokens-mean",
......@@ -147,11 +152,19 @@ def get_gap_result(artifact_dir: str) -> dict:
def benchmark_prefill(
isl, genai_perf_artifact_dir, model_name, base_url="http://localhost:8000"
isl,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url="http://localhost:8000",
):
logger.info(f"Running genai-perf with isl {isl}")
genai_perf_cmd = get_prefill_genai_perf_cmd(
isl, genai_perf_artifact_dir, model=model_name, base_url=base_url
isl,
genai_perf_artifact_dir,
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
)
print(f"genai-perf cmd: {genai_perf_cmd}")
# import pdb; pdb.set_trace()
......@@ -179,6 +192,7 @@ def benchmark_decode(
num_request,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url="http://localhost:8000",
):
logger.info(f"Profiling decode with num_request {num_request}...")
......@@ -194,6 +208,7 @@ def benchmark_decode(
num_request,
seed=seed,
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
)
gap_process = subprocess.Popen(
......@@ -211,6 +226,7 @@ def benchmark_decode(
num_request,
seed=seed,
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
)
gap_process = subprocess.Popen(
......
......@@ -21,6 +21,7 @@ logger.addHandler(console_handler)
def profile_decode(
work_dir,
model_name,
tokenizer,
url,
num_gpus,
max_kv_tokens,
......@@ -41,6 +42,13 @@ def profile_decode(
(max_context_length - osl) // interpolation_granularity,
):
max_concurrency = max_kv_tokens // (isl + osl)
if max_concurrency // interpolation_granularity == 0:
logger.warning(
f"max_concurrency {max_concurrency} is too small for"
f" interpolation granularity {interpolation_granularity}."
f" max_kv_tokens {max_kv_tokens}, isl {isl}, osl {osl}"
)
break
sweep_num_request = range(
1,
max_concurrency,
......@@ -54,6 +62,7 @@ def profile_decode(
num_request,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url=url,
)
if gap_result is not None:
......
......@@ -19,7 +19,13 @@ logger.addHandler(console_handler)
def profile_prefill(
work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity
work_dir,
model_name,
tokenizer,
url,
num_gpus,
max_context_length,
interpolation_granularity,
):
prefill_isl = []
prefill_ttft = []
......@@ -32,7 +38,11 @@ def profile_prefill(
# run genai-perf
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
gap_result = benchmark_prefill(
isl, genai_perf_artifact_dir, model_name, base_url=url
isl,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url=url,
)
if gap_result is not None:
ttft = gap_result["time_to_first_token"]["avg"]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment