Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f9a56006
Unverified
Commit
f9a56006
authored
Aug 06, 2024
by
Michael Goin
Committed by
GitHub
Aug 06, 2024
Browse files
[Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (#7225)
parent
fd95e026
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
33 additions
and
14 deletions
+33
-14
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+21
-4
tests/utils.py
tests/utils.py
+3
-2
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/quantization/gptq.py
+9
-7
vllm/model_executor/layers/quantization/gptq_marlin.py
vllm/model_executor/layers/quantization/gptq_marlin.py
+0
-1
No files found.
tests/basic_correctness/test_cpu_offload.py
View file @
f9a56006
...
@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
...
@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
[
"--cpu-offload-gb"
,
"2"
])
[
"--cpu-offload-gb"
,
"2"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"awq is not supported on this GPU type."
)
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_gptq
():
# Test GPTQ Marlin
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test GPTQ
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
,
[
"--quantization"
,
"gptq"
],
[
"--quantization"
,
"gptq"
,
"--cpu-offload-gb"
,
"1"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq_marlin"
),
reason
=
"awq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_awq
():
def
test_cpu_offload_awq
():
compare_two_settings
(
"casperhansen/llama-3-8b-instruct-awq"
,
[],
# Test AWQ Marlin
[
"--cpu-offload-gb"
,
"2"
])
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-AWQ"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test AWQ
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-AWQ"
,
[
"--quantization"
,
"awq"
],
[
"--quantization"
,
"awq"
,
"--cpu-offload-gb"
,
"1"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
...
...
tests/utils.py
View file @
f9a56006
...
@@ -266,8 +266,9 @@ def compare_two_settings(model: str,
...
@@ -266,8 +266,9 @@ def compare_two_settings(model: str,
arg1_results
=
results
[:
n
]
arg1_results
=
results
[:
n
]
arg2_results
=
results
[
n
:]
arg2_results
=
results
[
n
:]
for
arg1_result
,
arg2_result
in
zip
(
arg1_results
,
arg2_results
):
for
arg1_result
,
arg2_result
in
zip
(
arg1_results
,
arg2_results
):
assert
arg1_result
==
arg2_result
,
\
assert
arg1_result
==
arg2_result
,
(
f
"Results for
{
model
=
}
are not the same with
{
arg1
=
}
and
{
arg2
=
}
"
f
"Results for
{
model
=
}
are not the same with
{
arg1
=
}
and
{
arg2
=
}
. "
f
"
{
arg1_result
=
}
!=
{
arg2_result
=
}
"
)
def
init_test_distributed_environment
(
def
init_test_distributed_environment
(
...
...
vllm/model_executor/layers/quantization/gptq.py
View file @
f9a56006
...
@@ -204,13 +204,7 @@ class GPTQLinearMethod(LinearMethodBase):
...
@@ -204,13 +204,7 @@ class GPTQLinearMethod(LinearMethodBase):
layer
.
exllama_state
=
exllama_state
layer
.
exllama_state
=
exllama_state
def
apply
(
self
,
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
qweight
=
layer
.
qweight
out_shape
=
x
.
shape
[:
-
1
]
+
(
qweight
.
shape
[
-
1
],
)
reshaped_x
=
x
.
reshape
(
-
1
,
x
.
shape
[
-
1
])
# exllama needs to shuffle the weight after the weight is loaded
# exllama needs to shuffle the weight after the weight is loaded
# here we do the shuffle on first forward pass
# here we do the shuffle on first forward pass
if
layer
.
exllama_state
==
ExllamaState
.
UNINITIALIZED
:
if
layer
.
exllama_state
==
ExllamaState
.
UNINITIALIZED
:
...
@@ -222,6 +216,14 @@ class GPTQLinearMethod(LinearMethodBase):
...
@@ -222,6 +216,14 @@ class GPTQLinearMethod(LinearMethodBase):
layer
.
exllama_state
=
ExllamaState
.
READY
layer
.
exllama_state
=
ExllamaState
.
READY
ops
.
gptq_shuffle
(
layer
.
qweight
,
layer
.
g_idx
,
ops
.
gptq_shuffle
(
layer
.
qweight
,
layer
.
g_idx
,
self
.
quant_config
.
weight_bits
)
self
.
quant_config
.
weight_bits
)
def
apply
(
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
out_shape
=
x
.
shape
[:
-
1
]
+
(
layer
.
qweight
.
shape
[
-
1
],
)
reshaped_x
=
x
.
reshape
(
-
1
,
x
.
shape
[
-
1
])
output
=
ops
.
gptq_gemm
(
reshaped_x
,
layer
.
qweight
,
layer
.
qzeros
,
output
=
ops
.
gptq_gemm
(
reshaped_x
,
layer
.
qweight
,
layer
.
qzeros
,
layer
.
scales
,
layer
.
g_idx
,
layer
.
scales
,
layer
.
g_idx
,
layer
.
exllama_state
==
ExllamaState
.
READY
,
layer
.
exllama_state
==
ExllamaState
.
READY
,
...
...
vllm/model_executor/layers/quantization/gptq_marlin.py
View file @
f9a56006
...
@@ -251,7 +251,6 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
...
@@ -251,7 +251,6 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
scales_and_zp_size
,
scales_and_zp_size
,
output_size_per_partition
//
self
.
quant_config
.
pack_factor
,
output_size_per_partition
//
self
.
quant_config
.
pack_factor
,
dtype
=
torch
.
int32
,
dtype
=
torch
.
int32
,
device
=
"meta"
,
),
),
requires_grad
=
False
,
requires_grad
=
False
,
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment