OpenDAS / AutoAWQ · Commits · 149236e4

Commit 149236e4 (unverified), authored Jan 24, 2024 by Casper, committed by GitHub on Jan 24, 2024

Up to 60% faster context processing (#316)
Parent: c6c7b065

Showing 1 changed file with 17 additions and 3 deletions (+17, −3): awq/modules/linear/gemm.py
```diff
@@ -153,9 +153,23 @@ class WQLinear_GEMM(nn.Module):
         x = x.half()
 
         if AWQ_INSTALLED:
-            out = awq_ext.gemm_forward_cuda(
-                x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8
-            )
+            FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 1024
+
+            if FP16_MATMUL_HEURISTIC_CONDITION:
+                out = awq_ext.dequantize_weights_cuda(
+                    self.qweight, self.scales, self.qzeros, 0, 0, 0, False
+                )
+                out = torch.matmul(x, out)
+            else:
+                out = awq_ext.gemm_forward_cuda(
+                    x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8
+                )
         else:
             out = dequantize_gemm(self.qweight, ...
```
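The change dispatches on input size: for large inputs (many prompt tokens), dequantizing the weights once and running a dense fp16 matmul beats the fused quantized GEMM kernel, while small inputs (single-token decode) keep the fused path. A minimal sketch of that dispatch decision, with illustrative names (`choose_kernel` and its return labels are not part of AutoAWQ; only the `x.shape[0] * x.shape[1] >= 1024` condition comes from the diff):

```python
def choose_kernel(batch_size: int, seq_len: int, threshold: int = 1024) -> str:
    """Mirror FP16_MATMUL_HEURISTIC_CONDITION from the commit: when the
    total number of input rows (batch * sequence length) is large, the
    cost of dequantizing the weights is amortized, so a dense fp16
    matmul wins; otherwise use the fused quantized GEMM kernel."""
    if batch_size * seq_len >= threshold:
        return "dequantize_then_matmul"
    return "fused_gemm"

print(choose_kernel(1, 2048))  # long prompt (context processing)
print(choose_kernel(1, 1))     # single-token decode
```

This is why the speedup in the commit title applies to context processing specifically: prompt prefill easily exceeds the 1024-row threshold, whereas autoregressive decoding processes one token at a time and stays on the original kernel.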