[Misc] add w8a8 asym models (#11075)

8cef6e02 · Dipika Sikka · GitHub · b866cdbd · 8cef6e02
Unverified Commit 8cef6e02 authored Dec 23, 2024 by Dipika Sikka Committed by GitHub Dec 23, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 6 deletions

tests/quantization/test_compressed_tensors.py tests/quantization/test_compressed_tensors.py +10 -6

No files found.
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -79,12 +79,12 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
        assert output
-@pytest.mark.parametrize(
+@pytest.mark.parametrize("model_path", [
-    "model_path",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    [
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "neuralmagic/Llama-3.2-1B-quantized.w8a8"
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        # TODO static & asymmetric
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
-    ])
+])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [10])
 def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
@@ -92,6 +92,10 @@ def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
                                          max_tokens, num_logprobs):
    dtype = "bfloat16"
+    # skip language translation prompt for the static per tensor asym model
+    if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym":  # noqa: E501
+        example_prompts = example_prompts[0:-1]
    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)