Unverified Commit cbac4997 authored by YanbingJiang, committed by GitHub

Split test_intel_amx_attention_backend.py to avoid CI timeouts (#11370)


Co-authored-by: Ma Mingfei <mingfei.ma@intel.com>
parent 476c67d7
......@@ -16,7 +16,7 @@ import unittest
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import datetime
from functools import partial
from functools import partial, wraps
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Awaitable, Callable, List, Optional, Tuple
......@@ -1807,3 +1807,33 @@ def write_results_to_json(model, metrics, mode="a"):
with open("results.json", "w") as f:
json.dump(existing_results, f, indent=2)


def intel_amx_benchmark(extra_args=None, min_throughput=None):
    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            common_args = [
                "--attention-backend",
                "intel_amx",
                "--disable-radix",
                "--trust-remote-code",
            ]
            full_args = common_args + (extra_args or [])

            model = test_func(self)

            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
                model, full_args
            )

            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            if is_in_ci() and min_throughput is not None:
                self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator
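
For reference, a minimal sketch of how a test consumes this decorator (mirroring the test files changed below): the decorated method only returns a model identifier, and the wrapper runs run_bench_one_batch with the common intel_amx arguments plus extra_args, then enforces the decode-throughput floor when running in CI. The class name and model string here are hypothetical placeholders, not part of this commit.

# Illustrative usage only; ExampleAMXLatencyTest and the returned model string
# are hypothetical and do not appear in this change.
import unittest

from sglang.test.test_utils import CustomTestCase, intel_amx_benchmark


class ExampleAMXLatencyTest(CustomTestCase):
    @intel_amx_benchmark(
        extra_args=["--batch-size", "4", "--mem-fraction-static", "0.1"],
        min_throughput=40,
    )
    def test_latency_example_model(self):
        # The wrapper benchmarks whatever model name this test returns.
        return "example-org/example-model"


if __name__ == "__main__":
    unittest.main()
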
......@@ -8,8 +8,6 @@ import os
import unittest
from types import SimpleNamespace
from test_intel_amx_attention_backend import intel_amx_benchmark
from sglang.srt.utils import get_cpu_ids_by_node, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
......@@ -17,6 +15,7 @@ from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    intel_amx_benchmark,
    is_in_ci,
    popen_launch_server,
)
......
"""
Usage:
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_latency_default_model
"""
import unittest
from functools import wraps
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
......@@ -12,91 +11,30 @@ from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
    DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8,
    DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
    DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    intel_amx_benchmark,
    is_in_ci,
    popen_launch_server,
    run_bench_one_batch,
)


def intel_amx_benchmark(extra_args=None, min_throughput=None):
    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            common_args = [
                "--attention-backend",
                "intel_amx",
                "--disable-radix",
                "--trust-remote-code",
            ]
            full_args = common_args + (extra_args or [])

            model = test_func(self)

            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
                model, full_args
            )

            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            if is_in_ci() and min_throughput is not None:
                self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator


class TestIntelAMXAttnBackend(CustomTestCase):
    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=10)
    def test_latency_mla_model(self):
        return DEFAULT_MLA_MODEL_NAME_FOR_TEST

    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=40)
    def test_latency_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST

    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=150)
    def test_latency_fp8_qwen(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8

    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=50)
    def test_latency_fp8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE

    @intel_amx_benchmark(
        extra_args=["--batch-size", "4", "--quantization", "w8a8_int8"],
        min_throughput=100,
        extra_args=["--batch-size", "4", "--mem-fraction-static", "0.3"],
        min_throughput=10,
    )
    def test_latency_w8a8_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8

    def test_latency_mla_model(self):
        return DEFAULT_MLA_MODEL_NAME_FOR_TEST

    @intel_amx_benchmark(
        extra_args=[
            "--batch-size",
            "4",
            "--quantization",
            "w8a8_int8",
            "--mem-fraction-static",
            "0.9",
            "--max-total-tokens",
            "65536",
            "--tp",
            "6",
        ],
        min_throughput=100,
        extra_args=["--batch-size", "4", "--mem-fraction-static", "0.1"],
        min_throughput=40,
    )
    def test_latency_w8a8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE

    def test_latency_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST

    def test_mmlu(self):
        model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
......
"""
For intel_amx attention backend FP8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_1.TestIntelAMXAttnBackendQuant.test_latency_fp8_qwen
"""
import unittest
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
    DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8,
    CustomTestCase,
    intel_amx_benchmark,
)


class TestIntelAMXAttnBackendQuant(CustomTestCase):
    @intel_amx_benchmark(
        extra_args=["--batch-size", "4", "--mem-fraction-static", "0.1"],
        min_throughput=150,
    )
    def test_latency_fp8_qwen(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8

    @intel_amx_benchmark(
        extra_args=["--batch-size", "4", "--mem-fraction-static", "0.1"],
        min_throughput=50,
    )
    def test_latency_fp8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE


if __name__ == "__main__":
    unittest.main()
"""
For intel_amx attention backend w8a8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_2.TestIntelAMXAttnBackendQuant.test_latency_w8a8_default_model
"""
import unittest
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
    DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
    CustomTestCase,
    intel_amx_benchmark,
)


class TestIntelAMXAttnBackendQuant(CustomTestCase):
    @intel_amx_benchmark(
        extra_args=[
            "--batch-size",
            "4",
            "--quantization",
            "w8a8_int8",
            "--mem-fraction-static",
            "0.1",
        ],
        min_throughput=100,
    )
    def test_latency_w8a8_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8

    @intel_amx_benchmark(
        extra_args=[
            "--batch-size",
            "4",
            "--quantization",
            "w8a8_int8",
            "--mem-fraction-static",
            "0.9",
            "--max-total-tokens",
            "65536",
            "--tp",
            "6",
        ],
        min_throughput=100,
    )
    def test_latency_w8a8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE


if __name__ == "__main__":
    unittest.main()
......@@ -312,8 +312,10 @@ suite_xeon = {
TestFile("cpu/test_rope.py"),
TestFile("cpu/test_shared_expert.py"),
TestFile("cpu/test_topk.py"),
TestFile("test_cpu_graph.py"),
TestFile("test_intel_amx_attention_backend.py"),
TestFile("cpu/test_cpu_graph.py"),
TestFile("cpu/test_intel_amx_attention_backend_a.py"),
TestFile("cpu/test_intel_amx_attention_backend_b.py"),
TestFile("cpu/test_intel_amx_attention_backend_c.py"),
],
}
......