[Bugfix][Kernel] Prevent integer overflow in fp8 dynamic per-token quantize kernel (#9425)

c3fab5f7 · Tyler Michael Smith · GitHub · 776dbd74 · c3fab5f7
Unverified commit c3fab5f7, authored Oct 16, 2024 by Tyler Michael Smith; committed by GitHub on Oct 16, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

csrc/quantization/fp8/common.cu (+4 -2)

No files found.
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -204,8 +204,10 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
  int const tid = threadIdx.x;
  int const token_idx = blockIdx.x;
-  scalar_t const* __restrict__ token_input = &input[token_idx * hidden_size];
+  // Use int64 to avoid overflowing an int32 when calculating this offset
-  FP8_TYPE* __restrict__ token_output = &out[token_idx * hidden_size];
+  int64_t offset = static_cast<int64_t>(token_idx) * hidden_size;
+  scalar_t const* __restrict__ token_input = &input[offset];
+  FP8_TYPE* __restrict__ token_output = &out[offset];
  // For vectorization, token_input and token_output pointers need to be
  // aligned at 8-byte and 4-byte addresses respectively.