Merge branch 'lmslim_awq' into 'v0.5.0-dtk24.04.1'

Lmslim awq. See merge request dcutoolkit/deeplearing/vllm!10

Merge branch 'lmslim_awq' into 'v0.5.0-dtk24.04.1'
Lmslim awq. See merge request dcutoolkit/deeplearing/vllm!10
066e63c2 · zhuwenwen · 3b2b3046 · 70267267 · 066e63c2 · 066e63c2
Commit 066e63c2 authored Aug 15, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 16 deletions

vllm/_custom_ops.py vllm/_custom_ops.py +6 -0

vllm/model_executor/layers/quantization/awq.py vllm/model_executor/layers/quantization/awq.py +14 -16

No files found.
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -143,6 +143,12 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
 # quantization ops
 # awq
+def GetAWQShareWorkspaceSize()->int:
+    return quant_ops.GetAWQShareWorkspaceSize()
+def GetAWQShareWorkspace()->torch.Tensor:
+    return quant_ops.GetAWQShareWorkspace()
 def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
                   zeros: torch.Tensor, split_k_iters: int, thx: int,
                   thy: int) -> torch.Tensor:

--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -20,12 +20,8 @@ class AWQShareWorkSpace:
        return cls._instance
    def _initialize(self):
-        self.awqworkshapcesize = 2 << 29
+        self.awqworkshapcesize = ops.GetAWQShareWorkspaceSize()
-        self.awqworkshapce = torch.zeros(self.awqworkshapcesize // 2 + 1, dtype=torch.float16).cuda()
+        self.awqworkshapce = ops.GetAWQShareWorkspace()
-        #print("AWQShareWorkSpace _initialize\n")
-        #print("self.awqworkshapce.device:",self.awqworkshapce.device)
 class AWQConfig(QuantizationConfig):
    """Config class for AWQ.
@@ -201,8 +197,9 @@ class AWQLinearMethod(LinearMethodBase):
            padding_group=2
        else:
            padding_group=0
-        out = ops.awq_gemm(reshaped_x,
+        if m<4096: 
+            out = ops.awq_gemm(reshaped_x,
                            qweight,
                            zeros_and_scales,
                            m,
@@ -212,14 +209,15 @@ class AWQLinearMethod(LinearMethodBase):
                            padding_group,
                            self.awqsingleton.awqworkshapce,
                            self.awqsingleton.awqworkshapcesize)
-        #下面是采用rocblas的做法
+        else: 
-        # deqweight=ops.dequant_w4_gemm_colmajor(    #shape[n,k/8]--->[n,k]
+            #下面是采用rocblas的做法
-        #                   qweight, 
+            deqweight=ops.dequant_w4_gemm_colmajor(    #shape[n,k/8]--->[n,k]
-        #                   zeros_and_scales,
+                            qweight, 
-        #                   k,
+                            zeros_and_scales,
-        #                   n,
+                            k,
-        #                   self.quant_config.group_size)
+                            n,
-        # output=F.linear(reshaped_x, deqweight)    
+                            self.quant_config.group_size)
+            out=F.linear(reshaped_x, deqweight[:,0:k])    
        if bias is not None:
            out.add_(bias)