fused qknorm+rope kernel optimization for SM9.0 (#37376)

Signed-off-by: EricccYang <yangyang4991@gmail.com> Signed-off-by: Kaicheng Yang <53411596+EricccYang@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>

fused qknorm+rope kernel optimization for SM9.0 (#37376)
Signed-off-by: EricccYang <yangyang4991@gmail.com> Signed-off-by: Kaicheng Yang <53411596+EricccYang@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
4beeb068 · Kaicheng Yang · GitHub · cae98406 · 4beeb068 · 4beeb068
Unverified Commit 4beeb068 authored Apr 13, 2026 by Kaicheng Yang Committed by GitHub Apr 12, 2026
7 changed files
--- a/csrc/async_util.cuh
+++ b/csrc/async_util.cuh
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+namespace vllm {
+namespace cuda_async {
+// Copies exactly 16 bytes from global memory to shared memory.
+//
+// On CUDA SM80+ this issues an asynchronous `cp.async.cg` copy; the data is
+// only guaranteed to have landed after the copy has been committed with
+// cp_async_commit_group() and waited on with cp_async_wait_group<>().
+// On ROCm and on CUDA architectures below SM80 it falls back to a plain
+// synchronous 16-byte load/store. In a host compilation pass it is a no-op.
+//
+// Preconditions (from the 16-byte vector access / cp.async cp-size):
+//   - smem_ptr must point to shared memory and be 16-byte aligned.
+//   - glob_ptr must point to global memory and be 16-byte aligned.
+__device__ __forceinline__ void cp_async_shared_global_16_cg(
+    void* smem_ptr, const void* glob_ptr) {
+#if defined(USE_ROCM)
+  // ROCm build: no cp.async, do a synchronous 16-byte copy.
+  *reinterpret_cast<int4*>(smem_ptr) = *reinterpret_cast<const int4*>(glob_ptr);
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  // cp.async takes a 32-bit shared-state-space address, not a generic pointer.
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  // The "memory" clobber tells the compiler that this statement touches
+  // memory it cannot see through the operands (the shared destination is
+  // written indirectly), so ordinary loads/stores are not cached in
+  // registers or reordered across it.
+  asm volatile("cp.async.cg.shared.global [%0], [%1], 16;\n"
+               :
+               : "r"(smem), "l"(glob_ptr)
+               : "memory");
+#elif defined(__CUDA_ARCH__)
+  // Pre-SM80 device code: synchronous 16-byte copy.
+  *reinterpret_cast<int4*>(smem_ptr) = *reinterpret_cast<const int4*>(glob_ptr);
+#else
+  // Host compilation pass: nothing to do, just silence unused warnings.
+  (void)smem_ptr;
+  (void)glob_ptr;
+#endif
+}
+// Copies `size_bytes` bytes from global memory to shared memory.
+//
+// On CUDA SM80+ this issues an asynchronous `cp.async.ca` copy; the data is
+// only guaranteed to have landed after the copy has been committed with
+// cp_async_commit_group() and waited on with cp_async_wait_group<>().
+// On ROCm and on CUDA architectures below SM80 it falls back to a plain
+// synchronous load/store of the same width. In a host compilation pass it is
+// a no-op.
+//
+// size_bytes: 4, 8 or 16.
+//   NOTE: any value other than 4 or 8 is treated as 16 on every path, so an
+//   unsupported size silently copies 16 bytes. Callers must pass only
+//   4, 8 or 16.
+//
+// Preconditions: smem_ptr (shared memory) and glob_ptr (global memory) must
+// both be aligned to the copy width.
+__device__ __forceinline__ void cp_async_shared_global_ca(void* smem_ptr,
+                                                          const void* glob_ptr,
+                                                          int size_bytes) {
+#if defined(USE_ROCM)
+  // ROCm build: no cp.async, do a synchronous copy of the requested width.
+  if (size_bytes == 4) {
+    *reinterpret_cast<uint32_t*>(smem_ptr) =
+        *reinterpret_cast<const uint32_t*>(glob_ptr);
+  } else if (size_bytes == 8) {
+    *reinterpret_cast<uint64_t*>(smem_ptr) =
+        *reinterpret_cast<const uint64_t*>(glob_ptr);
+  } else {
+    *reinterpret_cast<int4*>(smem_ptr) =
+        *reinterpret_cast<const int4*>(glob_ptr);
+  }
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  // cp.async takes a 32-bit shared-state-space address, not a generic pointer.
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  // The copy size must be an immediate in the instruction, hence one asm
+  // statement per supported width. Each carries a "memory" clobber because
+  // the shared destination is written indirectly through the address
+  // operand; without it the compiler may cache or reorder ordinary accesses
+  // to that memory across the statement.
+  if (size_bytes == 4) {
+    asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n"
+                 :
+                 : "r"(smem), "l"(glob_ptr)
+                 : "memory");
+  } else if (size_bytes == 8) {
+    asm volatile("cp.async.ca.shared.global [%0], [%1], 8;\n"
+                 :
+                 : "r"(smem), "l"(glob_ptr)
+                 : "memory");
+  } else {
+    asm volatile("cp.async.ca.shared.global [%0], [%1], 16;\n"
+                 :
+                 : "r"(smem), "l"(glob_ptr)
+                 : "memory");
+  }
+#elif defined(__CUDA_ARCH__)
+  // Pre-SM80 device code: synchronous copy of the requested width.
+  if (size_bytes == 4) {
+    *reinterpret_cast<uint32_t*>(smem_ptr) =
+        *reinterpret_cast<const uint32_t*>(glob_ptr);
+  } else if (size_bytes == 8) {
+    *reinterpret_cast<uint64_t*>(smem_ptr) =
+        *reinterpret_cast<const uint64_t*>(glob_ptr);
+  } else {
+    *reinterpret_cast<int4*>(smem_ptr) =
+        *reinterpret_cast<const int4*>(glob_ptr);
+  }
+#else
+  // Host compilation pass: nothing to do, just silence unused warnings.
+  (void)smem_ptr;
+  (void)glob_ptr;
+  (void)size_bytes;
+#endif
+}
+// Closes the current batch of cp.async copies issued by this thread into one
+// group that cp_async_wait_group<>() can later wait on.
+//
+// Only emits an instruction on CUDA SM80+. On ROCm, pre-SM80 and host passes
+// it compiles to nothing, which matches the synchronous fallback copies in
+// this header.
+__device__ __forceinline__ void cp_async_commit_group() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 && !defined(USE_ROCM)
+  // "memory" clobber keeps the compiler from moving ordinary memory
+  // operations across the commit point.
+  asm volatile("cp.async.commit_group;\n" ::: "memory");
+#endif
+}
+// Waits until at most `n` of this thread's most recently committed cp.async
+// groups are still pending (n == 0 waits for all of them).
+//
+// Only emits an instruction on CUDA SM80+. On ROCm, pre-SM80 and host passes
+// it compiles to nothing, because the fallback copies in this header are
+// synchronous.
+//
+// NOTE: this only covers copies issued by the calling thread; making the data
+// visible to other threads still needs a barrier such as __syncthreads().
+template <int n>
+__device__ __forceinline__ void cp_async_wait_group() {
+  static_assert(n >= 0, "cp_async_wait_group: n must be non-negative");
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 && !defined(USE_ROCM)
+  // "memory" clobber: shared memory changes behind the compiler's back while
+  // the async copies complete, so reads of the destination must not be
+  // hoisted above this wait or served from stale registers.
+  asm volatile("cp.async.wait_group %0;\n" : : "n"(n) : "memory");
+#endif
+}
+}  // namespace cuda_async
+}  // namespace vllm
--- a/csrc/fused_qknorm_rope_kernel.cu
+++ b/csrc/fused_qknorm_rope_kernel.cu
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -96,7 +96,8 @@ void fused_qk_norm_rope(torch::Tensor& qkv, int64_t num_heads_q,
                        int64_t num_heads_k, int64_t num_heads_v,
                        int64_t head_dim, double eps, torch::Tensor& q_weight,
                        torch::Tensor& k_weight, torch::Tensor& cos_sin_cache,
-                        bool is_neox, torch::Tensor& position_ids);
+                        bool is_neox, torch::Tensor& position_ids,
+                        int64_t forced_token_heads_per_warp);
 void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& prompt_mask,
@@ -320,4 +321,4 @@ std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
    torch::Tensor const& norm_weight_k, torch::Tensor workspace,
    int64_t const q_size, int64_t const kv_size, int64_t const rank,
    int64_t const nranks, double const eps);
 #endif
\ No newline at end of file
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -173,7 +173,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "fused_qk_norm_rope(Tensor! qkv, int num_heads_q, "
      "int num_heads_k, int num_heads_v, int head_dim, float eps, "
      "Tensor q_weight, Tensor k_weight, Tensor cos_sin_cache, "
-      "bool is_neox, Tensor position_ids) -> ()");
+      "bool is_neox, Tensor position_ids, "
+      "int forced_token_heads_per_warp=-1) -> ()");
  ops.impl("fused_qk_norm_rope", torch::kCUDA, &fused_qk_norm_rope);
  // Apply repetition penalties to logits in-place

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -435,6 +435,7 @@ def fused_qk_norm_rope(
    cos_sin_cache: torch.Tensor,
    is_neox: bool,
    position_ids: torch.Tensor,
+    forced_token_heads_per_warp: int = -1,
 ) -> None:
    torch.ops._C.fused_qk_norm_rope(
        qkv,
@@ -448,6 +449,7 @@ def fused_qk_norm_rope(
        cos_sin_cache,
        is_neox,
        position_ids,
+        forced_token_heads_per_warp,
    )

--- a/vllm/compilation/passes/fusion/qk_norm_rope_fusion.py
+++ b/vllm/compilation/passes/fusion/qk_norm_rope_fusion.py
@@ -164,6 +164,7 @@ class QkNormRopePattern:
                cos_sin_cache=cos_sin_cache,
                is_neox=self.is_neox,
                position_ids=positions.view(-1),
+                forced_token_heads_per_warp=-1,
            )
            result_qkv = result[1]

--- a/vllm/compilation/passes/utility/fix_functionalization.py
+++ b/vllm/compilation/passes/utility/fix_functionalization.py
@@ -168,6 +168,7 @@ class FixFunctionalizationPass(VllmInductorPass):
                    "cos_sin_cache",
                    "is_neox",
                    "position_ids",
+                    "forced_token_heads_per_warp",
                )
                self.defunctionalize(graph, node, mutated_args=mutated_args, args=args)
            elif (