[Kernel] Zero point support in fused MarlinMoE kernel + AWQ Fused MoE (#8973)

Co-authored-by: Dipika <dipikasikka1@gmail.com> Co-authored-by: Dipika Sikka <ds3822@columbia.edu>

[Kernel] Zero point support in fused MarlinMoE kernel + AWQ Fused MoE (#8973)
Co-authored-by: Dipika <dipikasikka1@gmail.com> Co-authored-by: Dipika Sikka <ds3822@columbia.edu>
05d68643 · ElizaWszola · GitHub · 0dcc8cbe · 05d68643 · 05d68643
Unverified Commit 05d68643 authored Oct 04, 2024 by ElizaWszola Committed by GitHub Oct 04, 2024
3 changed files
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -557,14 +557,14 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
            x,
            layer.w13_qweight,
            layer.w2_qweight,
+            layer.w13_scales,
+            layer.w2_scales,
            router_logits,
-            layer.w13_g_idx,
-            layer.w2_g_idx,
-            layer.w13_g_idx_sort_indices,
-            layer.w2_g_idx_sort_indices,
            topk_weights,
            topk_ids,
-            w1_scale=layer.w13_scales,
+            g_idx1=layer.w13_g_idx,
-            w2_scale=layer.w2_scales,
+            g_idx2=layer.w2_g_idx,
+            sort_indices1=layer.w13_g_idx_sort_indices,
+            sort_indices2=layer.w2_g_idx_sort_indices,
            num_bits=self.quant_config.quant_type.size_bits,
        ).to(orig_dtype)
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -208,6 +208,7 @@ def marlin_moe_permute_scales(
        device=s.device,
        dtype=s.dtype,
    )
    for e in range(num_experts):
        output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
    return output
@@ -258,6 +259,20 @@ def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
    return marlin_zp
+def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
+                                  size_n: int, num_bits: int):
+    num_experts = q_zp_packed.shape[0]
+    output = torch.empty(
+        (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
+        device=q_zp_packed.device,
+        dtype=q_zp_packed.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n,
+                                              num_bits)
+    return output
 def apply_gptq_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -23,7 +23,9 @@ def get_model_architecture(
    architectures = getattr(model_config.hf_config, "architectures", [])
    # Special handling for quantized Mixtral.
    # FIXME(woosuk): This is a temporary hack.
-    mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin"]
+    mixtral_supported = [
+        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
+    ]
    if (model_config.quantization is not None
            and model_config.quantization not in mixtral_supported