[Bugfix] Fix Sparse24 Compressed Tensors models (#33446)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>

[Bugfix] Fix Sparse24 Compressed Tensors models (#33446)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
e9cd6911 · Kyle Sayers · GitHub · 80f2ba6e · e9cd6911 · e9cd6911
Unverified Commit e9cd6911 authored Feb 12, 2026 by Kyle Sayers Committed by GitHub Feb 11, 2026
3 changed files
--- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -6,11 +6,11 @@
 #include "cutlass_extensions/common.hpp"

 bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability) {
-  // sparse CUTLASS kernels need at least
+  // sparse CUTLASS kernels need exactly hopper and are not forward compatible
  //   CUDA 12.2 and SM90 (Hopper)

 #if defined CUDA_VERSION
-  return CUDA_VERSION >= 12020 && cuda_device_capability >= 90;
+  return CUDA_VERSION >= 12020 && cuda_device_capability == 90;
 #endif

  return false;
@@ -98,7 +98,7 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a) {

  TORCH_CHECK_NOT_IMPLEMENTED(
      false,
-      "No compiled cutlass_sparse_compress for a compute capability less than "
+      "No compiled cutlass_sparse_compress for a compute capability equal to "
      "CUDA device capability: ",
      version_num);
 }
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -207,13 +207,14 @@ class CompressedTensorsConfig(QuantizationConfig):
        # because Attention quantization on its own is not supported by vLLM.
        # It is coupled with KV-cache quantization, and if scales are present in the
        # checkpoint, they will be used properly.
+        if "config_groups" in config:
            grps_without_attn_quant = {}
            for k, v in config["config_groups"].items():
                # e.g. LlamaAttention, Qwen3Attention, etc.
                if len(v["targets"]) == 1 and v["targets"][0].endswith("Attention"):
                    logger.warning(
-                    "Skipping CompressedTensors config group for %s. Attention quant "
-                    "is coupled with KV-cache quantization in vLLM.",
+                        "Skipping CompressedTensors config group for %s. Attention "
+                        "quant is coupled with KV-cache quantization in vLLM.",
                        v["targets"][0],
                    )
                    continue

--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -261,6 +261,7 @@ def get_quant_config(
    if (
        hf_quant_config is not None
        and hf_quant_config.get("quant_method") == "compressed-tensors"
+        and "config_groups" in hf_quant_config
    ):
        if hf_text_config is not None:
            n_heads = getattr(hf_text_config, "num_attention_heads", None)