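[ROCm][GPTQ][Bugfix] Fix GPTQ GEMM kernel output zeroing race condition (#30719)

Remove the in-kernel zeroing of the output that was performed only by
blocks with blockIdx.z == 0 in the exllama 2/3/4/8-bit kernels and the
alt 4/8-bit kernels. The __syncthreads() that follows is a block-level
barrier, so it cannot order that zeroing against writes made by other
blocks (presumably those at blockIdx.z != 0 contributing to the same
output elements). Instead, gptq_gemm now allocates the output tensor `c`
with torch::zeros rather than torch::empty, so it is zero-filled before
the kernels are launched.

Signed-off-by: Andreas Karatzas <akaratza@amd.com>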

[ROCm][GPTQ][Bugfix] Fix GPTQ GEMM kernel output zeroing race condition (#30719)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
3ecfdc37 · Andreas Karatzas · GitHub · 45c1ca1c · 3ecfdc37
Unverified commit 3ecfdc37, authored Dec 29, 2025 by Andreas Karatzas; committed by GitHub on Dec 29, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 27 deletions

csrc/quantization/gptq/q_gemm.cu (+1 -27)

No files found.
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -233,11 +233,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
  // Zero output
  if (n >= size_n) return;

-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
  __syncthreads();

  // Find initial group
@@ -372,11 +367,6 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
  // Zero output
  if (n >= size_n) return;

-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
  __syncthreads();

  // Find initial group
@@ -494,11 +484,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
  // Zero output
  if (n >= size_n) return;

-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
  __syncthreads();

  // Find initial group
@@ -623,11 +608,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
  // Zero output
  if (n >= size_n) return;

-  if (blockIdx.z == 0) {
-    for (int m = 0; m < m_count; m++)
-      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
-  }
-
  __syncthreads();

  // Find initial group
@@ -1224,9 +1204,6 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
        __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
  }

-  if (blockIdx.z == 0) {
-    for (int m = 0; m < b_end; m++) mul[(b + m) * width + w] = __int2half_rn(0);
-  }
  __syncthreads();

  int i = width * h + w;
@@ -1319,9 +1296,6 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
    }
  }

-  if (blockIdx.z == 0) {
-    for (int m = 0; m < b_end; m++) mul[(b + m) * width + w] = __int2half_rn(0);
-  }
  __syncthreads();

  int i = width * h + w;
@@ -1857,7 +1831,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        bool use_exllama, bool use_v2_format, int64_t bit) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
-  at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
+  at::Tensor c = torch::zeros({a.size(0), b_q_weight.size(1)}, options);
  at::Tensor temp_dq = torch::empty(
      {b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);