Commit 2ac189ed (unverified), authored by HandH1998, committed by GitHub
Browse files

Amd test fp8 (#4261)

parent 5a6400ee
...@@ -55,6 +55,7 @@ jobs: ...@@ -55,6 +55,7 @@ jobs:
timeout-minutes: 20 timeout-minutes: 20
run: | run: |
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_accuracy_large.py docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_accuracy_large.py
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_fp8_accuracy.py
docker exec -w /sglang-checkout/test/srt ci_sglang python3 models/test_qwen_models.py docker exec -w /sglang-checkout/test/srt ci_sglang python3 models/test_qwen_models.py
mla-test-1-gpu-amd: mla-test-1-gpu-amd:
......
...@@ -237,6 +237,7 @@ class ModelConfig: ...@@ -237,6 +237,7 @@ class ModelConfig:
"compressed_tensors", "compressed_tensors",
"compressed-tensors", "compressed-tensors",
"fbgemm_fp8", "fbgemm_fp8",
"w8a8_fp8",
] ]
optimized_quantization_methods = [ optimized_quantization_methods = [
"fp8", "fp8",
......
...@@ -32,6 +32,10 @@ if _is_cuda: ...@@ -32,6 +32,10 @@ if _is_cuda:
else: else:
from sgl_kernel import fp8_scaled_mm from sgl_kernel import fp8_scaled_mm
# Input scaling factors are no longer optional in _scaled_mm starting
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)
def cutlass_fp8_supported(): def cutlass_fp8_supported():
if not _is_cuda: if not _is_cuda:
......
...@@ -28,6 +28,10 @@ from sglang.test.run_eval import run_eval ...@@ -28,6 +28,10 @@ from sglang.test.run_eval import run_eval
from sglang.utils import get_exception_traceback from sglang.utils import get_exception_traceback
DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8" DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
)
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct" DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct" DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
......
...@@ -69,6 +69,7 @@ suites = { ...@@ -69,6 +69,7 @@ suites = {
TestFile("test_vision_llm.py", 18.4), TestFile("test_vision_llm.py", 18.4),
TestFile("test_vision_openai_server.py", 344), TestFile("test_vision_openai_server.py", 344),
TestFile("test_w8a8_quantization.py", 46), TestFile("test_w8a8_quantization.py", 46),
TestFile("test_eval_fp8_accuracy.py", 172),
], ],
"nightly": [ "nightly": [
TestFile("test_nightly_gsm8k_eval.py"), TestFile("test_nightly_gsm8k_eval.py"),
......
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST,
DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
class TestEvalFP8Accuracy(unittest.TestCase):
    """MMLU accuracy check for DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST.

    A single process is started through ``popen_launch_server`` for the whole
    class and is killed again once the class has finished.
    """

    @classmethod
    def setUpClass(cls):
        """Record the model name and base URL, then start the process under test."""
        cls.model = DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the process tree rooted at the process started in setUpClass."""
        server_pid = cls.process.pid
        kill_process_tree(server_pid)

    def test_mmlu(self):
        # Settings handed to run_eval as attributes of a SimpleNamespace.
        eval_settings = {
            "base_url": self.base_url,
            "model": self.model,
            "eval_name": "mmlu",
            "num_examples": 64,
            "num_threads": 32,
            "temperature": 0.1,
        }
        result = run_eval(SimpleNamespace(**eval_settings))

        # The reported score must not fall below the 0.62 floor.
        score = result["score"]
        self.assertGreaterEqual(score, 0.62)
class TestEvalFP8DynamicQuantAccuracy(unittest.TestCase):
    """MMLU accuracy check with ``--quantization w8a8_fp8`` passed at launch.

    Uses DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST as the model.
    A single process is started through ``popen_launch_server`` for the whole
    class and is killed again once the class has finished.
    """

    @classmethod
    def setUpClass(cls):
        """Record the model name and base URL, then start the process under test."""
        cls.model = DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Extra launch arguments selecting the w8a8_fp8 quantization method.
        launch_flags = ["--quantization", "w8a8_fp8"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the process tree rooted at the process started in setUpClass."""
        server_pid = cls.process.pid
        kill_process_tree(server_pid)

    def test_mmlu(self):
        # Settings handed to run_eval as attributes of a SimpleNamespace.
        eval_settings = {
            "base_url": self.base_url,
            "model": self.model,
            "eval_name": "mmlu",
            "num_examples": 64,
            "num_threads": 32,
            "temperature": 0.1,
        }
        result = run_eval(SimpleNamespace(**eval_settings))

        # The reported score must not fall below the 0.70 floor.
        score = result["score"]
        self.assertGreaterEqual(score, 0.70)
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment