[Misc] Clean up the BitsAndBytes arguments (#15140)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

[Misc] Clean up the BitsAndBytes arguments (#15140)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
10f55fe6 · Jee Jee Li · GitHub · d3ccbd63 · 10f55fe6 · 10f55fe6
Unverified commit 10f55fe6, authored Mar 21, 2025 by Jee Jee Li; committed by GitHub Mar 20, 2025
3 changed files
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@@ -25,7 +25,7 @@ import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```

 ## Inflight quantization: load as 4bit quantization
@@ -35,7 +35,7 @@ from vllm import LLM
 import torch
 model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```

 ## OpenAI Compatible Server
@@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
 Append the following to your 4bit model arguments:

 ```console
---quantization bitsandbytes --load-format bitsandbytes
+--quantization bitsandbytes
 ```
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 qlora_adapter_name_or_path=lora_repo,
-                                 load_format="bitsandbytes",
                                 enable_lora=True,
                                 max_lora_rank=64)
    else:

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1170,22 +1170,15 @@ class EngineArgs:
        )

    def create_load_config(self) -> LoadConfig:
-        # bitsandbytes quantization needs a specific model loader
-        # so we make sure the quant method and the load format are consistent
-        if (self.quantization == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.load_format != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes quantization and QLoRA adapter only support "
-                f"'bitsandbytes' load format, but got {self.load_format}")

-        if (self.load_format == "bitsandbytes" or
-            self.qlora_adapter_name_or_path is not None) and \
+        if(self.qlora_adapter_name_or_path is not None) and \
            self.quantization != "bitsandbytes":
            raise ValueError(
-                "BitsAndBytes load format and QLoRA adapter only support "
+                "QLoRA adapter only support "
                f"'bitsandbytes' quantization, but got {self.quantization}")

+        if self.quantization == "bitsandbytes":
+            self.load_format = "bitsandbytes"
        return LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,