[Bugfix] Fix transformers model impl ignored for mixtral quant (#18602)

Signed-off-by: Tristan Leclercq <tristanleclercq@gmail.com>

[Bugfix] Fix transformers model impl ignored for mixtral quant (#18602)
Signed-off-by: Tristan Leclercq <tristanleclercq@gmail.com>
6220f3c6 · Tristan Leclercq · GitHub · 52fb23f4 · 6220f3c6
Unverified commit 6220f3c6, authored May 23, 2025 by Tristan Leclercq; committed by GitHub on May 23, 2025.
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 5 deletions

vllm/model_executor/model_loader/utils.py → vllm/model_executor/model_loader/utils.py (+4 -5)

No files found.
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -225,17 +225,16 @@ def get_model_architecture(
        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark"
    ]
-    if (model_config.quantization is not None
-            and model_config.quantization not in mixtral_supported
-            and "MixtralForCausalLM" in architectures):
-        architectures = ["QuantMixtralForCausalLM"]
    vllm_supported_archs = ModelRegistry.get_supported_archs()
    vllm_not_supported = not any(arch in vllm_supported_archs
                                 for arch in architectures)
    if (model_config.model_impl == ModelImpl.TRANSFORMERS or
            model_config.model_impl != ModelImpl.VLLM and vllm_not_supported):
        architectures = resolve_transformers_arch(model_config, architectures)
+    elif (model_config.quantization is not None
+          and model_config.quantization not in mixtral_supported
+          and "MixtralForCausalLM" in architectures):
+        architectures = ["QuantMixtralForCausalLM"]
    model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
    if model_config.task == "embed":