Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
066e63c2
Commit
066e63c2
authored
Aug 15, 2024
by
zhuwenwen
Browse files
Merge branch 'lmslim_awq' into 'v0.5.0-dtk24.04.1'
Lmslim awq See merge request dcutoolkit/deeplearing/vllm!10
parents
3b2b3046
70267267
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
20 additions
and
16 deletions
+20
-16
vllm/_custom_ops.py
vllm/_custom_ops.py
+6
-0
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq.py
+14
-16
No files found.
vllm/_custom_ops.py
View file @
066e63c2
...
@@ -143,6 +143,12 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
...
@@ -143,6 +143,12 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
# quantization ops
# quantization ops
# awq
# awq
def
GetAWQShareWorkspaceSize
()
->
int
:
return
quant_ops
.
GetAWQShareWorkspaceSize
()
def
GetAWQShareWorkspace
()
->
torch
.
Tensor
:
return
quant_ops
.
GetAWQShareWorkspace
()
def
awq_dequantize
(
qweight
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
def
awq_dequantize
(
qweight
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
zeros
:
torch
.
Tensor
,
split_k_iters
:
int
,
thx
:
int
,
zeros
:
torch
.
Tensor
,
split_k_iters
:
int
,
thx
:
int
,
thy
:
int
)
->
torch
.
Tensor
:
thy
:
int
)
->
torch
.
Tensor
:
...
...
vllm/model_executor/layers/quantization/awq.py
View file @
066e63c2
...
@@ -20,12 +20,8 @@ class AWQShareWorkSpace:
...
@@ -20,12 +20,8 @@ class AWQShareWorkSpace:
return
cls
.
_instance
return
cls
.
_instance
def
_initialize
(
self
):
def
_initialize
(
self
):
self
.
awqworkshapcesize
=
2
<<
29
self
.
awqworkshapcesize
=
ops
.
GetAWQShareWorkspaceSize
()
self
.
awqworkshapce
=
torch
.
zeros
(
self
.
awqworkshapcesize
//
2
+
1
,
dtype
=
torch
.
float16
).
cuda
()
self
.
awqworkshapce
=
ops
.
GetAWQShareWorkspace
()
#print("AWQShareWorkSpace _initialize\n")
#print("self.awqworkshapce.device:",self.awqworkshapce.device)
class
AWQConfig
(
QuantizationConfig
):
class
AWQConfig
(
QuantizationConfig
):
"""Config class for AWQ.
"""Config class for AWQ.
...
@@ -201,8 +197,9 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -201,8 +197,9 @@ class AWQLinearMethod(LinearMethodBase):
padding_group
=
2
padding_group
=
2
else
:
else
:
padding_group
=
0
padding_group
=
0
out
=
ops
.
awq_gemm
(
reshaped_x
,
if
m
<
4096
:
out
=
ops
.
awq_gemm
(
reshaped_x
,
qweight
,
qweight
,
zeros_and_scales
,
zeros_and_scales
,
m
,
m
,
...
@@ -212,14 +209,15 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -212,14 +209,15 @@ class AWQLinearMethod(LinearMethodBase):
padding_group
,
padding_group
,
self
.
awqsingleton
.
awqworkshapce
,
self
.
awqsingleton
.
awqworkshapce
,
self
.
awqsingleton
.
awqworkshapcesize
)
self
.
awqsingleton
.
awqworkshapcesize
)
#下面是采用rocblas的做法
else
:
# deqweight=ops.dequant_w4_gemm_colmajor( #shape[n,k/8]--->[n,k]
#下面是采用rocblas的做法
# qweight,
deqweight
=
ops
.
dequant_w4_gemm_colmajor
(
#shape[n,k/8]--->[n,k]
# zeros_and_scales,
qweight
,
# k,
zeros_and_scales
,
# n,
k
,
# self.quant_config.group_size)
n
,
# output=F.linear(reshaped_x, deqweight)
self
.
quant_config
.
group_size
)
out
=
F
.
linear
(
reshaped_x
,
deqweight
[:,
0
:
k
])
if
bias
is
not
None
:
if
bias
is
not
None
:
out
.
add_
(
bias
)
out
.
add_
(
bias
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment