[Perf] Using `__nv_fp8_e4m3` instead of `c10::Float8_e4m3fn` for `per_token_group_quant` (#21867)

Signed-off-by: yewentao256 <zhyanwentao@126.com>

[Perf] Using `__nv_fp8_e4m3` instead of `c10::Float8_e4m3fn` for `per_token_group_quant` (#21867)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
1b0a1555 · Wentao Ye · GitHub · 44bc46da · 1b0a1555
Unverified Commit 1b0a1555 authored Jul 29, 2025 by Wentao Ye Committed by GitHub Jul 29, 2025
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 4 deletions

csrc/quantization/fp8/per_token_group_quant.cu +2 -4

No files found.
--- a/csrc/quantization/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/fp8/per_token_group_quant.cu
 #include <ATen/cuda/CUDAContext.h>
-#include <c10/util/Float8_e4m3fn.h>

 #include "../per_token_group_quant_8bit.h"

 #include <cmath>

-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+#include <cuda_fp8.h>

 #include <torch/all.h>

@@ -199,7 +197,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "per_token_group_quant_8bit", ([&] {
        if (dst_type == at::ScalarType::Float8_e4m3fn) {
-          LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
+          LAUNCH_KERNEL(scalar_t, __nv_fp8_e4m3);
        } else if (dst_type == at::ScalarType::Char) {
          LAUNCH_KERNEL(scalar_t, int8_t);
        }