Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
016b8d1b
Unverified
Commit
016b8d1b
authored
Jul 15, 2025
by
Ruheena Suhani Shaik
Committed by
GitHub
Jul 14, 2025
Browse files
Enabled BnB NF4 inference on Gaudi (#20172)
Signed-off-by:
Ruheena Suhani Shaik
<
rsshaik@habana.ai
>
parent
80305c1b
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
8 deletions
+18
-8
vllm/model_executor/layers/quantization/bitsandbytes.py
vllm/model_executor/layers/quantization/bitsandbytes.py
+6
-6
vllm/model_executor/model_loader/bitsandbytes_loader.py
vllm/model_executor/model_loader/bitsandbytes_loader.py
+12
-2
No files found.
vllm/model_executor/layers/quantization/bitsandbytes.py
View file @
016b8d1b
...
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
...
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
from
vllm.model_executor.layers.quantization.base_config
import
(
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
QuantizationConfig
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
direct_register_custom_op
from
vllm.utils
import
direct_register_custom_op
...
@@ -390,12 +391,11 @@ def _apply_bnb_4bit_fake(
...
@@ -390,12 +391,11 @@ def _apply_bnb_4bit_fake(
try
:
try
:
direct_register_custom_op
(
direct_register_custom_op
(
op_name
=
"apply_bnb_4bit"
,
op_name
=
"apply_bnb_4bit"
,
op_func
=
_apply_bnb_4bit
,
op_func
=
_apply_bnb_4bit
,
mutates_args
=
[
"out"
],
mutates_args
=
[
"out"
],
fake_impl
=
_apply_bnb_4bit_fake
,
fake_impl
=
_apply_bnb_4bit_fake
,
dispatch_key
=
current_platform
.
dispatch_key
)
)
apply_bnb_4bit
=
torch
.
ops
.
vllm
.
apply_bnb_4bit
apply_bnb_4bit
=
torch
.
ops
.
vllm
.
apply_bnb_4bit
except
AttributeError
as
error
:
except
AttributeError
as
error
:
...
...
vllm/model_executor/model_loader/bitsandbytes_loader.py
View file @
016b8d1b
...
@@ -199,6 +199,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
...
@@ -199,6 +199,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
if
self
.
pre_quant
:
if
self
.
pre_quant
:
if
self
.
load_8bit
:
if
self
.
load_8bit
:
if
current_platform
.
is_hpu
():
raise
ValueError
(
"currently hpu supports 4bit quantization only"
)
return
self
.
_quantized_8bit_generator
(
return
self
.
_quantized_8bit_generator
(
hf_weights_files
,
use_safetensors
,
hf_weights_files
,
use_safetensors
,
quant_state_dict
),
quant_state_dict
quant_state_dict
),
quant_state_dict
...
@@ -302,6 +306,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
...
@@ -302,6 +306,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
in
temp_state_dict
):
in
temp_state_dict
):
quant_state
=
_parse_quant_state
(
mapped_weight_name
,
quant_state
=
_parse_quant_state
(
mapped_weight_name
,
temp_state_dict
)
temp_state_dict
)
if
current_platform
.
is_hpu
():
assert
quant_state
.
quant_type
==
"nf4"
,
(
"currently hpu supports nf4 quant_type only"
)
quant_state_dict
[
mapped_weight_name
]
=
quant_state
quant_state_dict
[
mapped_weight_name
]
=
quant_state
yield
org_weight_name
,
weight_tensor
yield
org_weight_name
,
weight_tensor
else
:
else
:
...
@@ -372,10 +380,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
...
@@ -372,10 +380,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
...]
...]
# bitsandbytes requires data in GPU
# bitsandbytes requires data in GPU
if
weight_sub_tensor
.
is_cuda
:
if
(
weight_sub_tensor
.
is_cuda
or
weight_sub_tensor
.
device
.
type
==
"hpu"
):
loaded_weight
=
weight_sub_tensor
loaded_weight
=
weight_sub_tensor
else
:
else
:
loaded_weight
=
weight_sub_tensor
.
cuda
()
loaded_weight
=
weight_sub_tensor
.
to
(
device
=
current_platform
.
device_type
)
# remove the following after the issue is fixed:
# remove the following after the issue is fixed:
# https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
# https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment