[Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)

13f9f7a3 · Jee Jee Li · GitHub · 1e7d5c01 · 13f9f7a3 · 13f9f7a3
Unverified commit 13f9f7a3, authored Sep 25, 2024 by Jee Jee Li; committed by GitHub Sep 24, 2024.
7 changed files
--- a/docs/source/quantization/bnb.rst
+++ b/docs/source/quantization/bnb.rst
@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.

 .. code-block:: console

-    $ pip install bitsandbytes>=0.42.0
+    $ pip install bitsandbytes>=0.44.0

 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.


--- a/examples/lora_with_quantization_inference.py
+++ b/examples/lora_with_quantization_inference.py
@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
        # It quantizes the model when loading, with some config info from the
        # LoRA adapter repo. So need to set the parameter of load_format and
        # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
+        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 qlora_adapter_name_or_path=lora_repo,
                                 load_format="bitsandbytes",
                                 enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+                                 max_lora_rank=64)
    else:
-        engine_args = EngineArgs(
-            model=model,
+        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+                                 max_loras=4)
    return LLMEngine.from_engine_args(engine_args)



--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
 aiohttp

 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=True,
+                     enforce_eager=False,
                     gpu_memory_utilization=0.8) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -222,6 +222,7 @@ class ModelConfig:
        self._verify_embedding_mode()
        self._verify_quantization()
        self._verify_cuda_graph()
+        self._verify_bnb_config()

    def _init_multimodal_config(
        self, limit_mm_per_prompt: Optional[Mapping[str, int]]
@@ -337,6 +338,28 @@ class ModelConfig:
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)

+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.44.0) with 8-bit models does not 
+        yet support CUDA graph.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on BitAndBytes 8bit yet, "
+                "fallback to the eager mode.")
+            self.enforce_eager = True
+
    def verify_async_output_proc(self, parallel_config, speculative_config,
                                 device_config) -> None:
        if not self.use_async_output_proc:
@@ -401,13 +424,6 @@ class ModelConfig:
                "Pipeline parallelism is only supported for the following "
                f" architectures: {_PP_SUPPORTED_MODELS}.")

-        # Remove the constraint after the bitsandbytes issue is fixed:
-        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
        if pipeline_parallel_size > 1 and self.use_async_output_proc:
            logger.warning("Async output processor is not supported with "
                           "pipeline parallelism currently. Disabling it.")

--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
    def __init__(self, quant_config: BitsAndBytesConfig):
        try:
            import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
        except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                              "bitsandbytes quantizer.") from err

        self.quant_config = quant_config

--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -851,12 +851,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
        # only load the bitsandbytes module when needed
        try:
            import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
        except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                              "bitsandbytes quantizer.") from err

        hf_weights_files, use_safetensors = self._prepare_weights(