修改awq workspace 申请

8cd246bd · zhuwenwen · 17928589 · 8cd246bd · 8cd246bd
Commit 8cd246bd authored Aug 15, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 11 deletions

vllm/_custom_ops.py vllm/_custom_ops.py +6 -0

vllm/model_executor/layers/quantization/awq.py vllm/model_executor/layers/quantization/awq.py +14 -11

No files found.
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -183,6 +183,12 @@ def advance_step(num_seqs: int, num_queries: int, block_size: int,
 # quantization ops
 # awq
+def GetAWQShareWorkspaceSize()->int:  # Thin wrapper: forwards to quant_ops.GetAWQShareWorkspaceSize() and returns its result unchanged.
+    return quant_ops.GetAWQShareWorkspaceSize()  # NOTE(review): units are not visible in this diff (presumably bytes) - confirm against the kernel side.
+def GetAWQShareWorkspace()->torch.Tensor:  # Thin wrapper: forwards to quant_ops.GetAWQShareWorkspace() and returns its tensor unchanged.
+    return quant_ops.GetAWQShareWorkspace()  # NOTE(review): device, dtype and ownership of the returned buffer are not visible in this diff - confirm.
 def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
                   zeros: torch.Tensor, split_k_iters: int, thx: int,
                   thy: int) -> torch.Tensor:

--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -20,8 +20,8 @@ class AWQShareWorkSpace:
        return cls._instance
    def _initialize(self):
-        self.awqworkshapcesize = 2 << 29
+        self.awqworkshapcesize = ops.GetAWQShareWorkspaceSize()  # Workspace size is now queried through ops instead of the hard-coded 2 << 29.
-        self.awqworkshapce = torch.zeros(self.awqworkshapcesize // 2 + 1, dtype=torch.float16).cuda()
+        self.awqworkshapce = ops.GetAWQShareWorkspace()  # Workspace buffer is now obtained through ops rather than allocated here with torch.zeros(...).cuda().
 class AWQConfig(QuantizationConfig):
@@ -200,7 +200,8 @@ class AWQLinearMethod(LinearMethodBase):
        else:
            padding_group=0
-        out = ops.awq_gemm(reshaped_x,
+        if m<4096: 
+            out = ops.awq_gemm(reshaped_x,
                            qweight,
                            zeros_and_scales,
                            m,
@@ -210,15 +211,17 @@ class AWQLinearMethod(LinearMethodBase):
                            padding_group,
                            self.awqsingleton.awqworkshapce,
                            self.awqsingleton.awqworkshapcesize)
-        #下面是采用rocblas的做法
+        else: 
-        # deqweight=ops.dequant_w4_gemm_colmajor(    #shape[n,k/8]--->[n,k]
+            #下面是采用rocblas的做法
-        #                   qweight, 
+            deqweight=ops.dequant_w4_gemm_colmajor(    #shape[n,k/8]--->[n,k]
-        #                   zeros_and_scales,
+                            qweight, 
-        #                   k,
+                            zeros_and_scales,
-        #                   n,
+                            k,
-        #                   self.quant_config.group_size)
+                            n,
-        # output=F.linear(reshaped_x, deqweight)    
+                            self.quant_config.group_size)
+            out=F.linear(reshaped_x, deqweight[:,0:k])    
        if bias is not None:
            out.add_(bias)
        return out.reshape(out_shape)