Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
13f9f7a3
Unverified
Commit
13f9f7a3
authored
Sep 25, 2024
by
Jee Jee Li
Committed by
GitHub
Sep 24, 2024
Browse files
[Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)
parent
1e7d5c01
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
44 additions
and
34 deletions
+44
-34
docs/source/quantization/bnb.rst
docs/source/quantization/bnb.rst
+1
-1
examples/lora_with_quantization_inference.py
examples/lora_with_quantization_inference.py
+10
-16
requirements-test.txt
requirements-test.txt
+1
-1
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+1
-1
vllm/config.py
vllm/config.py
+23
-7
vllm/model_executor/layers/quantization/bitsandbytes.py
vllm/model_executor/layers/quantization/bitsandbytes.py
+4
-4
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+4
-4
No files found.
docs/source/quantization/bnb.rst
View file @
13f9f7a3
...
...
@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
.. code-block:: console
$ pip install bitsandbytes>=0.42.0
$ pip install bitsandbytes>=0.44.0
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
...
...
examples/lora_with_quantization_inference.py
View file @
13f9f7a3
...
...
@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
# It quantizes the model when loading, with some config info from the
# LoRA adapter repo. So need to set the parameter of load_format and
# qlora_adapter_name_or_path as below.
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
quantization
=
quantization
,
qlora_adapter_name_or_path
=
lora_repo
,
load_format
=
"bitsandbytes"
,
enable_lora
=
True
,
max_lora_rank
=
64
,
# set it only in GPUs of limited memory
enforce_eager
=
True
)
max_lora_rank
=
64
)
else
:
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
quantization
=
quantization
,
enable_lora
=
True
,
max_loras
=
4
,
# set it only in GPUs of limited memory
enforce_eager
=
True
)
max_loras
=
4
)
return
LLMEngine
.
from_engine_args
(
engine_args
)
...
...
requirements-test.txt
View file @
13f9f7a3
...
...
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
aiohttp
# quantization
bitsandbytes==0.42.0
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.8
tests/quantization/test_bitsandbytes.py
View file @
13f9f7a3
...
...
@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
quantization
=
'bitsandbytes'
,
load_format
=
'bitsandbytes'
,
tensor_parallel_size
=
vllm_tp_size
,
enforce_eager
=
Tru
e
,
enforce_eager
=
Fals
e
,
gpu_memory_utilization
=
0.8
)
as
llm
:
vllm_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
vllm_logs
=
log_generated_texts
(
prompts
,
vllm_outputs
,
"VllmRunner"
)
...
...
vllm/config.py
View file @
13f9f7a3
...
...
@@ -222,6 +222,7 @@ class ModelConfig:
self
.
_verify_embedding_mode
()
self
.
_verify_quantization
()
self
.
_verify_cuda_graph
()
self
.
_verify_bnb_config
()
def
_init_multimodal_config
(
self
,
limit_mm_per_prompt
:
Optional
[
Mapping
[
str
,
int
]]
...
...
@@ -337,6 +338,28 @@ class ModelConfig:
self
.
max_seq_len_to_capture
=
min
(
self
.
max_seq_len_to_capture
,
self
.
max_model_len
)
def _verify_bnb_config(self) -> None:
    """Force eager mode for 8-bit bitsandbytes models.

    bitsandbytes 0.44.0 does not yet support CUDA graph for 8-bit models.
    When the model is quantized with bitsandbytes, its HF config carries a
    ``quantization_config`` whose ``load_in_8bit`` entry is truthy, and
    ``enforce_eager`` is off, a warning is logged and ``enforce_eager`` is
    switched to ``True``.
    """
    uses_bnb = self.quantization == "bitsandbytes"

    # The HF config may not carry a quantization section at all.
    quant_cfg = getattr(self.hf_config, "quantization_config", None)
    if quant_cfg is None:
        loads_in_8bit = False
    else:
        loads_in_8bit = quant_cfg.get("load_in_8bit", False)

    eager_disabled = not self.enforce_eager

    # A truthy loads_in_8bit already implies a quantization config exists.
    if uses_bnb and loads_in_8bit and eager_disabled:
        logger.warning(
            "CUDA graph is not supported on BitAndBytes 8bit yet, "
            "fallback to the eager mode.")
        self.enforce_eager = True
def
verify_async_output_proc
(
self
,
parallel_config
,
speculative_config
,
device_config
)
->
None
:
if
not
self
.
use_async_output_proc
:
...
...
@@ -401,13 +424,6 @@ class ModelConfig:
"Pipeline parallelism is only supported for the following "
f
" architectures:
{
_PP_SUPPORTED_MODELS
}
."
)
# Remove the constraint after the bitsandbytes issue is fixed:
# https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
if
self
.
quantization
==
"bitsandbytes"
and
self
.
enforce_eager
is
False
:
logger
.
warning
(
"CUDA graph is not supported on BitAndBytes yet, "
"fallback to the eager mode."
)
self
.
enforce_eager
=
True
if
pipeline_parallel_size
>
1
and
self
.
use_async_output_proc
:
logger
.
warning
(
"Async output processor is not supported with "
"pipeline parallelism currently. Disabling it."
)
...
...
vllm/model_executor/layers/quantization/bitsandbytes.py
View file @
13f9f7a3
...
...
@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
def
__init__
(
self
,
quant_config
:
BitsAndBytesConfig
):
try
:
import
bitsandbytes
if
bitsandbytes
.
__version__
<
"0.4
2
.0"
:
if
bitsandbytes
.
__version__
<
"0.4
4
.0"
:
raise
ImportError
(
"bitsandbytes version is wrong. Please "
"install bitsandbytes>=0.4
2
.0."
)
"install bitsandbytes>=0.4
4
.0."
)
except
ImportError
as
err
:
raise
ImportError
(
"Please install bitsandbytes>=0.4
2
.0 via "
"`pip install bitsandbytes>=0.4
2
.0` to use "
raise
ImportError
(
"Please install bitsandbytes>=0.4
4
.0 via "
"`pip install bitsandbytes>=0.4
4
.0` to use "
"bitsandbytes quantizer."
)
from
err
self
.
quant_config
=
quant_config
...
...
vllm/model_executor/model_loader/loader.py
View file @
13f9f7a3
...
...
@@ -851,12 +851,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# only load the bitsandbytes module when needed
try
:
import
bitsandbytes
if
bitsandbytes
.
__version__
<
"0.4
2
.0"
:
if
bitsandbytes
.
__version__
<
"0.4
4
.0"
:
raise
ImportError
(
"bitsandbytes version is wrong. Please "
"install bitsandbytes>=0.4
2
.0."
)
"install bitsandbytes>=0.4
4
.0."
)
except
ImportError
as
err
:
raise
ImportError
(
"Please install bitsandbytes>=0.4
2
.0 via "
"`pip install bitsandbytes>=0.4
2
.0` to use "
raise
ImportError
(
"Please install bitsandbytes>=0.4
4
.0 via "
"`pip install bitsandbytes>=0.4
4
.0` to use "
"bitsandbytes quantizer."
)
from
err
hf_weights_files
,
use_safetensors
=
self
.
_prepare_weights
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment