Unverified commit 4beeb068, authored by Kaicheng Yang and committed by GitHub
Browse files

fused qknorm+rope kernel optimization for SM9.0 (#37376)


Signed-off-by: EricccYang <yangyang4991@gmail.com>
Signed-off-by: Kaicheng Yang <53411596+EricccYang@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
parent cae98406
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
namespace vllm {
namespace cuda_async {
/**
 * Copy 16 bytes from `glob_ptr` (global memory) to `smem_ptr` (shared memory).
 *
 * - CUDA, __CUDA_ARCH__ >= 800: issues one `cp.async.cg.shared.global`
 *   instruction. The destination must not be read until the copy has been
 *   committed and waited on (see cp_async_commit_group() /
 *   cp_async_wait_group()).
 * - ROCm and CUDA devices below SM80: performs a plain `int4` load/store, so
 *   the data is written by an ordinary store instead of `cp.async`.
 * - Host compilation pass: the arguments are discarded and nothing is copied.
 *
 * Both pointers are dereferenced as `int4` on the fallback paths, so they are
 * expected to be 16-byte aligned.
 *
 * @param smem_ptr Destination address in shared memory.
 * @param glob_ptr Source address in global memory.
 */
__device__ __forceinline__ void cp_async_shared_global_16_cg(
    void* smem_ptr, const void* glob_ptr) {
#if defined(USE_ROCM)
  *reinterpret_cast<int4*>(smem_ptr) = *reinterpret_cast<const int4*>(glob_ptr);
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  // cp.async takes a 32-bit shared-state-space address, not a generic pointer.
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  // The "memory" clobber tells the compiler that this statement touches memory
  // it cannot see through the operands (the shared-memory destination), so
  // surrounding loads/stores are not cached or reordered across it.
  asm volatile("cp.async.cg.shared.global [%0], [%1], 16;\n"
               :
               : "r"(smem), "l"(glob_ptr)
               : "memory");
#elif defined(__CUDA_ARCH__)
  *reinterpret_cast<int4*>(smem_ptr) = *reinterpret_cast<const int4*>(glob_ptr);
#else
  (void)smem_ptr;
  (void)glob_ptr;
#endif
}
/**
 * Copy `size_bytes` bytes from `glob_ptr` (global memory) to `smem_ptr`
 * (shared memory).
 *
 * `size_bytes` selects the transfer width: 4 and 8 are handled explicitly and
 * EVERY other value is treated as 16. Callers must therefore pass only 4, 8 or
 * 16; any other value still moves 16 bytes.
 *
 * - CUDA, __CUDA_ARCH__ >= 800: issues one `cp.async.ca.shared.global`
 *   instruction of the selected width. The destination must not be read until
 *   the copy has been committed and waited on (see cp_async_commit_group() /
 *   cp_async_wait_group()).
 * - ROCm and CUDA devices below SM80: performs a plain load/store of
 *   `uint32_t`, `uint64_t` or `int4`.
 * - Host compilation pass: the arguments are discarded and nothing is copied.
 *
 * The fallback paths dereference the pointers as the selected type, so both
 * pointers are expected to be aligned to the transfer width.
 *
 * @param smem_ptr   Destination address in shared memory.
 * @param glob_ptr   Source address in global memory.
 * @param size_bytes Transfer width in bytes (4, 8, otherwise 16).
 */
__device__ __forceinline__ void cp_async_shared_global_ca(void* smem_ptr,
                                                          const void* glob_ptr,
                                                          int size_bytes) {
#if defined(USE_ROCM)
  if (size_bytes == 4) {
    *reinterpret_cast<uint32_t*>(smem_ptr) =
        *reinterpret_cast<const uint32_t*>(glob_ptr);
  } else if (size_bytes == 8) {
    *reinterpret_cast<uint64_t*>(smem_ptr) =
        *reinterpret_cast<const uint64_t*>(glob_ptr);
  } else {
    *reinterpret_cast<int4*>(smem_ptr) =
        *reinterpret_cast<const int4*>(glob_ptr);
  }
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  // cp.async takes a 32-bit shared-state-space address, not a generic pointer.
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  // The copy size must be an immediate in the instruction text, hence one asm
  // statement per width. Each carries a "memory" clobber because the
  // shared-memory write is not visible to the compiler through the operands.
  if (size_bytes == 4) {
    asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n"
                 :
                 : "r"(smem), "l"(glob_ptr)
                 : "memory");
  } else if (size_bytes == 8) {
    asm volatile("cp.async.ca.shared.global [%0], [%1], 8;\n"
                 :
                 : "r"(smem), "l"(glob_ptr)
                 : "memory");
  } else {
    asm volatile("cp.async.ca.shared.global [%0], [%1], 16;\n"
                 :
                 : "r"(smem), "l"(glob_ptr)
                 : "memory");
  }
#elif defined(__CUDA_ARCH__)
  if (size_bytes == 4) {
    *reinterpret_cast<uint32_t*>(smem_ptr) =
        *reinterpret_cast<const uint32_t*>(glob_ptr);
  } else if (size_bytes == 8) {
    *reinterpret_cast<uint64_t*>(smem_ptr) =
        *reinterpret_cast<const uint64_t*>(glob_ptr);
  } else {
    *reinterpret_cast<int4*>(smem_ptr) =
        *reinterpret_cast<const int4*>(glob_ptr);
  }
#else
  (void)smem_ptr;
  (void)glob_ptr;
  (void)size_bytes;
#endif
}
/**
 * Emit `cp.async.commit_group`, closing the current batch of previously issued
 * `cp.async` copies.
 *
 * Only emitted for non-ROCm device code with __CUDA_ARCH__ >= 800; in every
 * other configuration this function is an empty no-op.
 */
__device__ __forceinline__ void cp_async_commit_group() {
#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
  asm volatile("cp.async.commit_group;\n"
               : /* no outputs */
               : /* no inputs */);
#endif
}
/**
 * Emit `cp.async.wait_group n`, blocking the calling thread until at most `n`
 * of its most recently committed `cp.async` groups are still pending.
 *
 * Only emitted for non-ROCm device code with __CUDA_ARCH__ >= 800; in every
 * other configuration this function is an empty no-op.
 *
 * @tparam n Number of most-recent committed groups allowed to remain in
 *           flight; passed to the instruction as an immediate operand.
 */
template <int n>
__device__ __forceinline__ void cp_async_wait_group() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 && !defined(USE_ROCM)
  // "memory" clobber: the data written by the awaited copies only becomes
  // valid after this point, so the compiler must not hoist shared-memory
  // reads above the wait or reuse values loaded before it.
  asm volatile("cp.async.wait_group %0;\n" : : "n"(n) : "memory");
#endif
}
} // namespace cuda_async
} // namespace vllm
This diff is collapsed.
...@@ -96,7 +96,8 @@ void fused_qk_norm_rope(torch::Tensor& qkv, int64_t num_heads_q, ...@@ -96,7 +96,8 @@ void fused_qk_norm_rope(torch::Tensor& qkv, int64_t num_heads_q,
int64_t num_heads_k, int64_t num_heads_v, int64_t num_heads_k, int64_t num_heads_v,
int64_t head_dim, double eps, torch::Tensor& q_weight, int64_t head_dim, double eps, torch::Tensor& q_weight,
torch::Tensor& k_weight, torch::Tensor& cos_sin_cache, torch::Tensor& k_weight, torch::Tensor& cos_sin_cache,
bool is_neox, torch::Tensor& position_ids); bool is_neox, torch::Tensor& position_ids,
int64_t forced_token_heads_per_warp);
void apply_repetition_penalties_(torch::Tensor& logits, void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask, const torch::Tensor& prompt_mask,
...@@ -320,4 +321,4 @@ std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk( ...@@ -320,4 +321,4 @@ std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
torch::Tensor const& norm_weight_k, torch::Tensor workspace, torch::Tensor const& norm_weight_k, torch::Tensor workspace,
int64_t const q_size, int64_t const kv_size, int64_t const rank, int64_t const q_size, int64_t const kv_size, int64_t const rank,
int64_t const nranks, double const eps); int64_t const nranks, double const eps);
#endif #endif
\ No newline at end of file
...@@ -173,7 +173,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -173,7 +173,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"fused_qk_norm_rope(Tensor! qkv, int num_heads_q, " "fused_qk_norm_rope(Tensor! qkv, int num_heads_q, "
"int num_heads_k, int num_heads_v, int head_dim, float eps, " "int num_heads_k, int num_heads_v, int head_dim, float eps, "
"Tensor q_weight, Tensor k_weight, Tensor cos_sin_cache, " "Tensor q_weight, Tensor k_weight, Tensor cos_sin_cache, "
"bool is_neox, Tensor position_ids) -> ()"); "bool is_neox, Tensor position_ids, "
"int forced_token_heads_per_warp=-1) -> ()");
ops.impl("fused_qk_norm_rope", torch::kCUDA, &fused_qk_norm_rope); ops.impl("fused_qk_norm_rope", torch::kCUDA, &fused_qk_norm_rope);
// Apply repetition penalties to logits in-place // Apply repetition penalties to logits in-place
......
...@@ -435,6 +435,7 @@ def fused_qk_norm_rope( ...@@ -435,6 +435,7 @@ def fused_qk_norm_rope(
cos_sin_cache: torch.Tensor, cos_sin_cache: torch.Tensor,
is_neox: bool, is_neox: bool,
position_ids: torch.Tensor, position_ids: torch.Tensor,
forced_token_heads_per_warp: int = -1,
) -> None: ) -> None:
torch.ops._C.fused_qk_norm_rope( torch.ops._C.fused_qk_norm_rope(
qkv, qkv,
...@@ -448,6 +449,7 @@ def fused_qk_norm_rope( ...@@ -448,6 +449,7 @@ def fused_qk_norm_rope(
cos_sin_cache, cos_sin_cache,
is_neox, is_neox,
position_ids, position_ids,
forced_token_heads_per_warp,
) )
......
...@@ -164,6 +164,7 @@ class QkNormRopePattern: ...@@ -164,6 +164,7 @@ class QkNormRopePattern:
cos_sin_cache=cos_sin_cache, cos_sin_cache=cos_sin_cache,
is_neox=self.is_neox, is_neox=self.is_neox,
position_ids=positions.view(-1), position_ids=positions.view(-1),
forced_token_heads_per_warp=-1,
) )
result_qkv = result[1] result_qkv = result[1]
......
...@@ -168,6 +168,7 @@ class FixFunctionalizationPass(VllmInductorPass): ...@@ -168,6 +168,7 @@ class FixFunctionalizationPass(VllmInductorPass):
"cos_sin_cache", "cos_sin_cache",
"is_neox", "is_neox",
"position_ids", "position_ids",
"forced_token_heads_per_warp",
) )
self.defunctionalize(graph, node, mutated_args=mutated_args, args=args) self.defunctionalize(graph, node, mutated_args=mutated_args, args=args)
elif ( elif (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment