[Bugfix] gptq_marlin: Ensure g_idx_sort_indices is not a Parameter (#5108)

5bf185a1 · Alexander Matveev · GitHub · 4fbcb0f2 · 5bf185a1
Unverified commit 5bf185a1, authored May 29, 2024 by Alexander Matveev; committed by GitHub on May 30, 2024.
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 8 deletions

vllm/model_executor/layers/quantization/gptq_marlin.py (+4 -8)

No files found.
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -298,14 +298,10 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
            },
        )
-        g_idx_sort_indices = Parameter(
+        g_idx_sort_indices = torch.empty(
-            torch.empty(
            g_idx.shape,
            dtype=torch.int32,
-            ),
-            requires_grad=False,
        )
-        set_weight_attrs(g_idx_sort_indices, extra_weight_attrs)
        # Scales
        scales = Parameter(
@@ -356,9 +352,9 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
        layer.register_parameter("qweight", qweight)
        layer.register_parameter("g_idx", g_idx)
-        layer.register_parameter("g_idx_sort_indices", g_idx_sort_indices)
        layer.register_parameter("scales", scales)
        layer.register_parameter("qzeros", qzeros)
+        layer.g_idx_sort_indices = g_idx_sort_indices
        layer.workspace = workspace
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition