Unverified commit 79ece2c5, authored by Lianmin Zheng and committed by GitHub

Report median instead of mean in bench_latency.py (#1269)

parent 55f5976b
@@ -233,7 +233,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ### Supported Models

 **Generative Models**
-- Exaone 3.0
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
@@ -253,6 +252,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Exaone 3

 **Embedding Models**
...
@@ -292,6 +292,7 @@ def latency_test_run_once(
     measurement_results["prefill_throughput"] = throughput

     # Decode
+    decode_latencies = []
     for i in range(output_len):
         torch.cuda.synchronize()
         tic = time.time()
@@ -300,17 +301,18 @@ def latency_test_run_once(
         latency = time.time() - tic
         tot_latency += latency
         throughput = batch_size / latency
+        decode_latencies.append(latency)
         if i < 5:
             rank_print(
                 f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
-    avg_decode_latency = (tot_latency - prefill_latency) / output_len
-    avg_decode_throughput = batch_size / avg_decode_latency
+    med_decode_latency = np.median(decode_latencies)
+    med_decode_throughput = batch_size / med_decode_latency
     rank_print(
-        f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+        f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
     )
-    measurement_results["avg_decode_latency"] = avg_decode_latency
-    measurement_results["avg_decode_throughput"] = avg_decode_throughput
+    measurement_results["median_decode_latency"] = med_decode_latency
+    measurement_results["median_decode_throughput"] = med_decode_throughput

     throughput = (input_len + output_len) * batch_size / tot_latency
     rank_print(
...
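Why median? A single slow decode step (first-iteration warm-up, CUDA graph capture, a stray GC pause) can inflate a mean, while the median tracks the typical per-token step time. A minimal sketch of the difference, using hypothetical latency values:

import numpy as np

# Hypothetical per-token decode latencies in seconds; the first step is slow
# (e.g. warm-up) and would dominate a mean over only a few iterations.
decode_latencies = [0.250, 0.011, 0.010, 0.012, 0.011, 0.010]

mean_latency = np.mean(decode_latencies)      # ~0.0507 s, skewed by the outlier
median_latency = np.median(decode_latencies)  # 0.011 s, the typical step time

batch_size = 1
print(f"mean throughput:   {batch_size / mean_latency:9.2f} token/s")    # ~19.74
print(f"median throughput: {batch_size / median_latency:9.2f} token/s")  # ~90.91

On this toy data the mean understates decode throughput by more than 4x, which is the distortion the change above removes from the reported numbers.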
@@ -50,8 +50,6 @@ for name, cls in _CONFIG_REGISTRY.items():
     with contextlib.suppress(ValueError):
         AutoConfig.register(name, cls)

-from sglang.srt.utils import is_multimodal_model
-

 def download_from_hf(model_path: str):
     if os.path.exists(model_path):
@@ -60,12 +58,6 @@ def download_from_hf(model_path: str):
     return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])


-def get_config_json(model_path: str):
-    with open(os.path.join(model_path, "configs.json")) as f:
-        config = json.load(f)
-    return config
-
-
 def get_config(
     model: str,
     trust_remote_code: bool,
...
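For context on the registration loop in the last hunk: AutoConfig.register raises a ValueError when the name is already taken by a built-in Transformers config (for example, when a newer transformers release ships that model type natively), so wrapping it in contextlib.suppress(ValueError) lets a custom config registry coexist with such versions. A minimal, self-contained sketch, with DummyConfig as a hypothetical stand-in for a custom config class:

import contextlib

from transformers import AutoConfig, PretrainedConfig


class DummyConfig(PretrainedConfig):
    # Hypothetical custom config; only model_type matters for registration.
    model_type = "dummy-model"


_CONFIG_REGISTRY = {DummyConfig.model_type: DummyConfig}

for name, cls in _CONFIG_REGISTRY.items():
    # If the name collides with a config transformers already knows,
    # register raises ValueError; suppressing it keeps the loop safe.
    with contextlib.suppress(ValueError):
        AutoConfig.register(name, cls)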