Enabled BnB NF4 inference on Gaudi (#20172)

Signed-off-by: Ruheena Suhani Shaik <rsshaik@habana.ai>

Enabled BnB NF4 inference on Gaudi (#20172)
Signed-off-by: Ruheena Suhani Shaik <rsshaik@habana.ai>
016b8d1b · Ruheena Suhani Shaik · GitHub · 80305c1b · 016b8d1b · 016b8d1b
Unverified Commit 016b8d1b authored Jul 15, 2025 by Ruheena Suhani Shaik Committed by GitHub Jul 14, 2025
2 changed files
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
+from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
@@ -390,12 +391,11 @@ def _apply_bnb_4bit_fake(
 try:
-    direct_register_custom_op(
+    direct_register_custom_op(op_name="apply_bnb_4bit",
-        op_name="apply_bnb_4bit",
+                              op_func=_apply_bnb_4bit,
-        op_func=_apply_bnb_4bit,
+                              mutates_args=["out"],
-        mutates_args=["out"],
+                              fake_impl=_apply_bnb_4bit_fake,
-        fake_impl=_apply_bnb_4bit_fake,
+                              dispatch_key=current_platform.dispatch_key)
-    )
    apply_bnb_4bit = torch.ops.vllm.apply_bnb_4bit
 except AttributeError as error:

--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -199,6 +199,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
        if self.pre_quant:
            if self.load_8bit:
+                if current_platform.is_hpu():
+                    raise ValueError(
+                        "currently hpu supports 4bit quantization only")
                return self._quantized_8bit_generator(
                    hf_weights_files, use_safetensors,
                    quant_state_dict), quant_state_dict
@@ -302,6 +306,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                        in temp_state_dict):
                quant_state = _parse_quant_state(mapped_weight_name,
                                                 temp_state_dict)
+                if current_platform.is_hpu():
+                    assert quant_state.quant_type == "nf4", (
+                        "currently hpu supports nf4 quant_type only")
                quant_state_dict[mapped_weight_name] = quant_state
                yield org_weight_name, weight_tensor
            else:
@@ -372,10 +380,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                                                      ...]
                # bitsandbytes requires data in GPU
-                if weight_sub_tensor.is_cuda:
+                if (weight_sub_tensor.is_cuda
+                        or weight_sub_tensor.device.type == "hpu"):
                    loaded_weight = weight_sub_tensor
                else:
-                    loaded_weight = weight_sub_tensor.cuda()
+                    loaded_weight = weight_sub_tensor.to(
+                        device=current_platform.device_type)
                # remove the following after the issue is fixed:
                # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342