[Bugfix] Enable some fp8 and quantized fullgraph tests (#10171)

Signed-off-by: Bill Nell <bill@neuralmagic.com>

[Bugfix] Enable some fp8 and quantized fullgraph tests (#10171)
Signed-off-by: Bill Nell <bill@neuralmagic.com>
f192aeba · bnellnm · GitHub · 8e1529dc · f192aeba
Unverified commit f192aeba, authored Nov 09, 2024 by bnellnm; committed by GitHub on Nov 09, 2024
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 13 additions and 16 deletions

tests/compile/utils.py +13 -16

No files found.
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -9,29 +9,26 @@ from vllm.platforms import current_platform

 TEST_MODELS = [
    ("facebook/opt-125m", {}),
-    # TODO: add fake implementation for compressed-tensors
-    # ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-    #     "dtype": torch.float16,
-    #     "quantization": "compressed-tensors"
-    # }),
+    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+        "dtype": torch.float16,
+        "quantization": "compressed-tensors"
+    }),
    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
        "dtype": torch.float16,
        "quantization": "fp8"
    }),
-    # TODO: add fake implementation for compressed-tensors
-    # ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
-    #     "quantization": "compressed-tensors"
-    # }),
+    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
+        "quantization": "compressed-tensors"
+    }),
    ("meta-llama/Meta-Llama-3-8B", {}),
 ]

-# TODO: enable in pytorch 2.5
-if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
+if is_quant_method_supported("aqlm"):
    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
        "quantization": "aqlm"
    }))

-# TODO: enable in pytorch 2.5
+# TODO: figure out why this fails.
 if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
        "quantization": "gguf"
@@ -71,13 +68,13 @@ def check_full_graph_support(model,
    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

-    # Inductor doesn't support fp8 and the base meta llama uses too
-    # much memory.
-    quantization = model_kwargs.get("quantization")
-    if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B")
+    # The base meta llama uses too much memory.
+    if (model == "meta-llama/Meta-Llama-3-8B"
            and optimization_level >= CompilationLevel.PIECEWISE):
        return

+    print(f"MODEL={model}")
+
    prompts = [
        "Hello, my name is",
        "The president of the United States is",