Enable 4bit bnb prequant MOE (#21548)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>

Enable 4bit bnb prequant MOE (#21548)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
9b94d6ec · Andy Chen · GitHub · 1891a265 · 9b94d6ec
Unverified Commit 9b94d6ec authored Aug 11, 2025 by Andy Chen Committed by GitHub Aug 11, 2025
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 7 deletions

vllm/model_executor/model_loader/bitsandbytes_loader.py +3 -7

No files found.
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -427,14 +427,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
            elif isinstance(module, FusedMoE) and hasattr(
                    module.quant_method, "quant_config"):
                # TODO: support FusedMoE with prequant and 8bit.
-                if self.pre_quant:
+                if self.pre_quant and self.load_8bit:
-                    raise ValueError(
-                        "Prequant BitsAndBytes models with FusedMoE is not "
-                        "supported yet.")
-                if self.load_8bit:
                    raise ValueError(
-                        "BitsAndBytes 8bit quantization with FusedMoE is not "
+                        "Prequant BitsAndBytes 8bit models with FusedMoE "
-                        "supported yet.")
+                        "is not supported yet.")
                # Get the corresponding weight name using module name and
                # expert_params_mapping.