Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66530409
Unverified
Commit
66530409
authored
Aug 26, 2024
by
Dipika Sikka
Committed by
GitHub
Aug 26, 2024
Browse files
[Misc] Update `qqq` to use vLLMParameters (#7805)
parent
2deb029d
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
55 additions
and
65 deletions
+55
-65
tests/weight_loading/models.txt
tests/weight_loading/models.txt
+3
-1
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+1
-1
vllm/model_executor/layers/quantization/qqq.py
vllm/model_executor/layers/quantization/qqq.py
+51
-63
No files found.
tests/weight_loading/models.txt
View file @
66530409
...
...
@@ -18,3 +18,5 @@ awq_marlin, casperhansen/mixtral-instruct-awq, main
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
qqq, HandH1998/QQQ-Llama-3-8b-g128, main
qqq, HandH1998/QQQ-Llama-3-8b, main
\ No newline at end of file
vllm/model_executor/layers/linear.py
View file @
66530409
...
...
@@ -23,7 +23,7 @@ logger = init_logger(__name__)
WEIGHT_LOADER_V2_SUPPORTED
=
[
"CompressedTensorsLinearMethod"
,
"AWQMarlinLinearMethod"
,
"AWQLinearMethod"
,
"GPTQMarlinLinearMethod"
,
"Fp8LinearMethod"
,
"MarlinLinearMethod"
"MarlinLinearMethod"
,
"QQQLinearMethod"
]
...
...
vllm/model_executor/layers/quantization/qqq.py
View file @
66530409
...
...
@@ -8,7 +8,10 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.linear
import
LinearBase
,
LinearMethodBase
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.parameter
import
(
BasevLLMParameter
,
ChannelQuantScaleParameter
,
GroupQuantScaleParameter
,
PackedvLLMParameter
)
logger
=
init_logger
(
__name__
)
...
...
@@ -133,6 +136,7 @@ class QQQLinearMethod(LinearMethodBase):
params_dtype
:
torch
.
dtype
,
**
extra_weight_attrs
,
):
weight_loader
=
extra_weight_attrs
[
"weight_loader"
]
if
params_dtype
!=
torch
.
float16
:
raise
ValueError
(
f
"The params dtype must be float16, but got
{
params_dtype
}
"
)
...
...
@@ -170,90 +174,74 @@ class QQQLinearMethod(LinearMethodBase):
"Each permutation group must reside on the same gpu"
)
# Quantized 4Bit weights packed into Int32.
qweight
=
Parameter
(
torch
.
empty
(
qweight
=
PackedvLLMParameter
(
data
=
torch
.
empty
(
input_size_per_partition
//
self
.
quant_config
.
tile_size
,
output_size_per_partition
*
self
.
quant_config
.
tile_size
//
self
.
quant_config
.
pack_factor
,
device
=
"cuda"
,
dtype
=
torch
.
int32
,
),
requires_grad
=
False
,
)
set_weight_attrs
(
qweight
,
{
"input_dim"
:
0
,
"output_dim"
:
1
,
"packed_dim"
:
1
,
"pack_factor"
:
self
.
quant_config
.
pack_factor
,
"marlin_tile_size"
:
self
.
quant_config
.
tile_size
,
},
)
s_channel
=
Parameter
(
torch
.
empty
(
input_dim
=
0
,
output_dim
=
1
,
packed_dim
=
1
,
packed_factor
=
self
.
quant_config
.
pack_factor
,
marlin_tile_size
=
self
.
quant_config
.
tile_size
,
weight_loader
=
weight_loader
)
s_channel
=
ChannelQuantScaleParameter
(
data
=
torch
.
empty
(
1
,
output_size_per_partition
,
device
=
"cuda"
,
dtype
=
torch
.
float
,
),
requires_grad
=
False
,
)
set_weight_attrs
(
s_channel
,
{
"input_dim"
:
None
,
"output_dim"
:
1
,
},
)
weight_loader
=
weight_loader
,
output_dim
=
1
)
if
self
.
quant_config
.
group_size
==
-
1
:
s_group
=
Parameter
(
torch
.
tensor
(
s_group_data
=
torch
.
tensor
(
[],
device
=
"cuda"
,
dtype
=
torch
.
half
,
),
requires_grad
=
False
,
)
else
:
s_group
=
Parameter
(
torch
.
empty
(
s_group_data
=
torch
.
empty
(
input_size_per_partition
//
self
.
quant_config
.
group_size
,
output_size_per_partition
,
device
=
"cuda"
,
dtype
=
torch
.
half
,
),
requires_grad
=
False
,
)
set_weight_attrs
(
s_group
,
{
"input_dim"
:
None
if
self
.
quant_config
.
group_size
==
-
1
else
0
,
"output_dim"
:
None
if
self
.
quant_config
.
group_size
==
-
1
else
1
,
}
,
)
s_group_attr
=
{
"data"
:
s_group_data
,
"weight_loader"
:
weight_loader
}
if
self
.
quant_config
.
group_size
==
-
1
:
s_group
=
BasevLLMParameter
(
**
s_group_attr
)
else
:
s_group
=
GroupQuantScaleParameter
(
output_dim
=
1
,
input_dim
=
0
,
**
s_group_attr
)
# Allocate workspace (Used for internal locking mechanism)
max_workspace_size
=
(
output_size_per_partition
//
self
.
quant_config
.
min_n_threads
)
*
self
.
quant_config
.
max_parallel
workspace
=
Parameter
(
torch
.
zeros
(
max_workspace_size
,
workspace
=
BasevLLMParameter
(
data
=
torch
.
zeros
(
max_workspace_size
,
device
=
"cuda"
,
dtype
=
torch
.
int
),
requires_grad
=
False
)
weight_loader
=
weight_loader
)
layer
.
register_parameter
(
"B"
,
qweight
)
set_weight_attrs
(
qweight
,
extra_weight_attrs
)
layer
.
register_parameter
(
"s_channel"
,
s_channel
)
set_weight_attrs
(
s_channel
,
extra_weight_attrs
)
layer
.
register_parameter
(
"s_group"
,
s_group
)
set_weight_attrs
(
s_group
,
extra_weight_attrs
)
layer
.
register_parameter
(
"workspace"
,
workspace
)
set_weight_attrs
(
workspace
,
extra_weight_attrs
)
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
# required by torch.compile
layer
.
B
=
Parameter
(
layer
.
B
.
data
,
requires_grad
=
False
)
layer
.
s_channel
=
Parameter
(
layer
.
s_channel
.
data
,
requires_grad
=
False
)
layer
.
s_group
=
Parameter
(
layer
.
s_group
.
data
,
requires_grad
=
False
)
layer
.
workspace
=
Parameter
(
layer
.
workspace
.
data
,
requires_grad
=
False
)
def
apply
(
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment