Unverified Commit 093a27c5 authored by Daniël de Kok, committed by GitHub

Add support for GPTQ Marlin (#2052)

Add support for GPTQ Marlin kernels

GPTQ Marlin extends the Marlin kernels to support common GPTQ
configurations:

- bits: 4 or 8
- groupsize: -1, 32, 64, or 128
- desc_act: true/false
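
As a rough illustration, an eligibility check for this support matrix might look like the sketch below (not the actual TGI code; the helper name and constants are assumptions for illustration):

```python
# Hypothetical check mirroring the supported GPTQ Marlin configurations above.
GPTQ_MARLIN_BITS = {4, 8}
GPTQ_MARLIN_GROUP_SIZES = {-1, 32, 64, 128}


def can_use_gptq_marlin(bits: int, groupsize: int, desc_act: bool) -> bool:
    # desc_act (activation reordering) is supported either way; it only
    # affects how g_idx is handled during repacking, not eligibility.
    return bits in GPTQ_MARLIN_BITS and groupsize in GPTQ_MARLIN_GROUP_SIZES
```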

Using the GPTQ Marlin kernels requires repacking the parameters into the
Marlin quantizer format.
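
A minimal sketch of what that load-time repacking could look like, assuming a `gptq_marlin_repack` entry point in the vendored kernels (the module name and argument order here are assumptions, not the committed API):

```python
import torch


def repack_gptq_for_marlin_sketch(qweight, g_idx, bits, desc_act, in_features, out_features):
    # Hypothetical sketch: convert GPTQ-packed weights into the Marlin tile
    # layout once at load time so the Marlin matmul kernels can consume them.
    import marlin_kernels  # vendored kernels; name assumed for illustration

    if desc_act and g_idx is not None:
        # With desc_act=True, rows are permuted into group order before repacking.
        perm = torch.argsort(g_idx).to(torch.int32)
    else:
        perm = torch.empty(0, dtype=torch.int32, device=qweight.device)

    # Signature assumed for illustration: (packed weight, permutation, k, n, bits).
    return marlin_kernels.gptq_marlin_repack(qweight, perm, in_features, out_features, bits)
```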

The kernels were contributed by Neural Magic to vLLM. We vendor them
here for convenience.
parent f433f1f7
@@ -83,7 +83,7 @@ class BLOOMSharded(CausalLM):
             process_group=self.process_group,
             prefix="transformer",
         )
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = BloomForCausalLM(config, weights)

@@ -166,7 +166,7 @@ def _load_gqa(config, prefix: str, weights):
         dim=0,
     )
-    if config.quantize not in ["gptq", "awq"]:
+    if config.quantize not in ["gptq", "awq", "marlin"]:
         weight = weight.to(dtype=weights.dtype).to(device=weights.device)
     head_size = config.hidden_size // config.num_attention_heads

@@ -81,16 +81,11 @@ def _load_multi_mqa_gptq(
         qzeros = torch.cat([q_tensor, kv_tensor], dim=1)
         qzeros = qzeros.to(device=weights.device)
-        (
-            bits,
-            groupsize,
-            _,
-            quant_method,
-        ) = weights._get_gptq_params()
-        if quant_method == "gptq":
+        gptq_params = weights._get_gptq_params()
+        if gptq_params.quant_method == "gptq":
             g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx")
             g_idx = g_idx.to(device=weights.device)
-        elif quant_method == "awq":
+        elif gptq_params.quant_method == "awq":
             g_idx = None
             from text_generation_server.layers.awq.conversion_utils import (
                 fast_awq_to_gptq,
@@ -105,8 +100,8 @@ def _load_multi_mqa_gptq(
             qzeros=qzeros,
             scales=scales,
             g_idx=g_idx,
-            bits=bits,
-            groupsize=groupsize,
+            bits=gptq_params.bits,
+            groupsize=gptq_params.groupsize,
             use_exllama=HAS_EXLLAMA,
         )

@@ -130,7 +130,7 @@ def _load_gqa(config, prefix: str, weights):
         dim=0,
     )
-    if config.quantize not in ["gptq", "awq"]:
+    if config.quantize not in ["gptq", "awq", "marlin"]:
         weight = weight.to(dtype=weights.dtype).to(device=weights.device)
     head_size = config.hidden_size // config.num_attention_heads

@@ -55,7 +55,7 @@ class FlashCohere(FlashCausalLM):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = FlashCohereForCausalLM(config, weights)

@@ -80,7 +80,7 @@ class FlashDbrx(FlashCausalLM):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = FlashDbrxForCausalLM(config, weights)

@@ -53,7 +53,7 @@ class FlashGemma(FlashCausalLM):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         # TODO hardcoded

@@ -67,7 +67,7 @@ class FlashLlama(FlashCausalLM):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "exl2"]:
+        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         prefix = ""

@@ -68,7 +68,7 @@ class BaseFlashMistral(FlashCausalLM):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         prefix = ""

@@ -58,7 +58,7 @@ class FlashNeoXSharded(FlashCausalLM):
         weights = Weights(
             filenames, device=device, dtype=dtype, process_group=self.process_group
         )
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = FlashGPTNeoXForCausalLM(config, weights)

@@ -53,7 +53,7 @@ class FlashPhi(FlashCausalLM):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = FlashPhiForCausalLM(config, weights)

@@ -62,7 +62,7 @@ class FlashQwen2(BaseFlashMistral):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = Qwen2ForCausalLM(config, weights)

@@ -67,7 +67,7 @@ class FlashRWSharded(FlashCausalLM):
         config.quantize = quantize
         config.speculator = speculator
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = FlashRWForCausalLM(config, weights)

@@ -69,7 +69,7 @@ class FlashSantacoderSharded(FlashCausalLM):
             process_group=self.process_group,
             aliases={"transformer.wte.weight": ["lm_head.weight"]},
         )
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = FlashSantacoderForCausalLM(config, weights)

@@ -61,7 +61,7 @@ class FlashStarcoder2(BaseFlashMistral):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = FlashStarcoder2ForCausalLM(config, weights)

@@ -205,7 +205,7 @@ class GalacticaSharded(CausalLM):
         weights = Weights(
             filenames, device=device, dtype=dtype, process_group=self.process_group
         )
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = OPTForCausalLM(config, weights)

@@ -58,7 +58,7 @@ class GPTNeoxSharded(CausalLM):
         weights = Weights(
             filenames, device=device, dtype=dtype, process_group=self.process_group
         )
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
            weights._set_gptq_params(model_id, revision)
         model = GPTNeoxForCausalLM(config, weights)

@@ -82,7 +82,7 @@ class MPTSharded(CausalLM):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         config.quantize = quantize

@@ -56,7 +56,7 @@ class OPTSharded(CausalLM):
         weights = Weights(
             filenames, device=device, dtype=dtype, process_group=self.process_group
         )
-        if config.quantize == "gptq":
+        if config.quantize in ["gptq", "marlin"]:
             weights._set_gptq_params(model_id, revision)
         model = OPTForCausalLM(config, weights)