Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
10f55fe6
Unverified
Commit
10f55fe6
authored
Mar 21, 2025
by
Jee Jee Li
Committed by
GitHub
Mar 20, 2025
Browse files
[Misc] Clean up the BitsAndBytes arguments (#15140)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
d3ccbd63
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
7 additions
and
15 deletions
+7
-15
docs/source/features/quantization/bnb.md
docs/source/features/quantization/bnb.md
+3
-3
examples/offline_inference/lora_with_quantization_inference.py
...les/offline_inference/lora_with_quantization_inference.py
+0
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+4
-11
No files found.
docs/source/features/quantization/bnb.md
View file @
10f55fe6
...
@@ -25,7 +25,7 @@ import torch
...
@@ -25,7 +25,7 @@ import torch
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
model_id
=
"unsloth/tinyllama-bnb-4bit"
model_id
=
"unsloth/tinyllama-bnb-4bit"
llm
=
LLM
(
model
=
model_id
,
dtype
=
torch
.
bfloat16
,
trust_remote_code
=
True
,
\
llm
=
LLM
(
model
=
model_id
,
dtype
=
torch
.
bfloat16
,
trust_remote_code
=
True
,
\
quantization
=
"bitsandbytes"
,
load_format
=
"bitsandbytes"
)
quantization
=
"bitsandbytes"
)
```
```
## Inflight quantization: load as 4bit quantization
## Inflight quantization: load as 4bit quantization
...
@@ -35,7 +35,7 @@ from vllm import LLM
...
@@ -35,7 +35,7 @@ from vllm import LLM
import
torch
import
torch
model_id
=
"huggyllama/llama-7b"
model_id
=
"huggyllama/llama-7b"
llm
=
LLM
(
model
=
model_id
,
dtype
=
torch
.
bfloat16
,
trust_remote_code
=
True
,
\
llm
=
LLM
(
model
=
model_id
,
dtype
=
torch
.
bfloat16
,
trust_remote_code
=
True
,
\
quantization
=
"bitsandbytes"
,
load_format
=
"bitsandbytes"
)
quantization
=
"bitsandbytes"
)
```
```
## OpenAI Compatible Server
## OpenAI Compatible Server
...
@@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
...
@@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
Append the following to your 4bit model arguments:
Append the following to your 4bit model arguments:
```
console
```
console
--quantization bitsandbytes
--load-format bitsandbytes
--quantization bitsandbytes
```
```
examples/offline_inference/lora_with_quantization_inference.py
View file @
10f55fe6
...
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
...
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
quantization
=
quantization
,
quantization
=
quantization
,
qlora_adapter_name_or_path
=
lora_repo
,
qlora_adapter_name_or_path
=
lora_repo
,
load_format
=
"bitsandbytes"
,
enable_lora
=
True
,
enable_lora
=
True
,
max_lora_rank
=
64
)
max_lora_rank
=
64
)
else
:
else
:
...
...
vllm/engine/arg_utils.py
View file @
10f55fe6
...
@@ -1170,22 +1170,15 @@ class EngineArgs:
...
@@ -1170,22 +1170,15 @@ class EngineArgs:
)
)
def
create_load_config
(
self
)
->
LoadConfig
:
def
create_load_config
(
self
)
->
LoadConfig
:
# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
if
(
self
.
quantization
==
"bitsandbytes"
or
self
.
qlora_adapter_name_or_path
is
not
None
)
and
\
self
.
load_format
!=
"bitsandbytes"
:
raise
ValueError
(
"BitsAndBytes quantization and QLoRA adapter only support "
f
"'bitsandbytes' load format, but got
{
self
.
load_format
}
"
)
if
(
self
.
load_format
==
"bitsandbytes"
or
if
(
self
.
qlora_adapter_name_or_path
is
not
None
)
and
\
self
.
qlora_adapter_name_or_path
is
not
None
)
and
\
self
.
quantization
!=
"bitsandbytes"
:
self
.
quantization
!=
"bitsandbytes"
:
raise
ValueError
(
raise
ValueError
(
"
BitsAndBytes load format and
QLoRA adapter only support "
"QLoRA adapter only support "
f
"'bitsandbytes' quantization, but got
{
self
.
quantization
}
"
)
f
"'bitsandbytes' quantization, but got
{
self
.
quantization
}
"
)
if
self
.
quantization
==
"bitsandbytes"
:
self
.
load_format
=
"bitsandbytes"
return
LoadConfig
(
return
LoadConfig
(
load_format
=
self
.
load_format
,
load_format
=
self
.
load_format
,
download_dir
=
self
.
download_dir
,
download_dir
=
self
.
download_dir
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment