Commit 1a8114bf authored by fengzch-das

hipify code

parent c0177256
Pipeline #3049 canceled with stages
@@ -3,7 +3,7 @@
 #include "dispatch_cutlass.h"
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/conv/device/direct_convolution.h"
@@ -74,7 +74,7 @@ static cutlass::Status depthwise_conv2d_kernel_run(cutlass::conv::Conv2dProblemS
 UnderlyingKernel::ElementA *A, UnderlyingKernel::ElementB *B,
 UnderlyingKernel::ElementC *C, UnderlyingKernel::ElementC *D,
 ElementCompute alpha, ElementCompute beta, std::string split_k_mode,
-cudaStream_t stream, int device_id = 0)
+hipStream_t stream, int device_id = 0)
 {
 // create the tensor references
 cutlass::Tensor4DCoord tensor_coord_A = cutlass::conv::implicit_gemm_tensor_a_extent(
@@ -183,7 +183,7 @@ Tensor depthwise_conv2d_kernel(Tensor A, Tensor B) {
 Tensor D = Tensor::allocate({N, P, Q, K}, A.dtype(), A.device());
-auto stream = getCurrentCUDAStream();
+auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
 cutlass::Status status = depthwise_conv2d_kernel_run(
 &problem_size,
@@ -319,7 +319,7 @@ Tensor dwconv_f16(Tensor input, Tensor weight, Tensor out, Tensor bias) {
 size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
 BufferCUDA workspace(workspace_size);
-auto stream = getCurrentCUDAStream();
+auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
 cutlass::Status status = implicit_gemm_op.can_implement(arguments);
 if (status != cutlass::Status::kSuccess) {
......
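Note on the hunks above: the hipify pass maps the CUDA runtime header and stream types onto their HIP counterparts (<cuda_runtime.h> -> <hip/hip_runtime.h>, cudaStream_t -> hipStream_t) and routes stream lookup through getCurrentHIPStreamMasqueradingAsCUDA(). A minimal, self-contained sketch of the same stream mapping using only public HIP runtime calls; stream_roundtrip is illustrative and not part of this repository:

#include <hip/hip_runtime.h>   // hipified from <cuda_runtime.h>
#include <cstdio>

// Create a stream, enqueue an async host-to-device copy, and wait for it.
static void stream_roundtrip(const float *host_src, float *device_dst, size_t n) {
    hipStream_t stream;                                   // was cudaStream_t
    if (hipStreamCreate(&stream) != hipSuccess) return;   // was cudaStreamCreate
    hipMemcpyAsync(device_dst, host_src, n * sizeof(float),
                   hipMemcpyHostToDevice, stream);        // was cudaMemcpyAsync
    hipStreamSynchronize(stream);                         // was cudaStreamSynchronize
    hipStreamDestroy(stream);                             // was cudaStreamDestroy
    std::printf("copied %zu floats\n", n);
}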
@@ -2,7 +2,7 @@
 #include "common.h"
 #include "Tensor.h"
-#include <cuda_fp16.h>
+#include <hip/hip_fp16.h>
 void rms_norm(Tensor &out, // [num_tokens, hidden_size]
 Tensor &input, // [num_tokens, hidden_size]
......
#include "hip/hip_runtime.h"
#include "layernorm_kernels_impl.cuh" #include "layernorm_kernels_impl.cuh"
#include "dispatch_utils.h" #include "dispatch_utils.h"
...@@ -10,17 +11,17 @@ void rms_norm(Tensor &out, // [..., hidden_size] ...@@ -10,17 +11,17 @@ void rms_norm(Tensor &out, // [..., hidden_size]
int num_tokens = input.numel() / hidden_size; int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
if (use_quant) { if (use_quant) {
vllm::rms_norm_kernel<scalar_t, int8_t, true><<<grid, block, 0, stream>>>(out.data_ptr<int8_t>(), hipLaunchKernelGGL(( vllm::rms_norm_kernel<scalar_t, int8_t, true>), dim3(grid), dim3(block), 0, stream, out.data_ptr<int8_t>(),
input.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
epsilon, epsilon,
num_tokens, num_tokens,
hidden_size); hidden_size);
} else { } else {
vllm::rms_norm_kernel<scalar_t, scalar_t, false><<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), hipLaunchKernelGGL(( vllm::rms_norm_kernel<scalar_t, scalar_t, false>), dim3(grid), dim3(block), 0, stream, out.data_ptr<scalar_t>(),
input.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
epsilon, epsilon,
...@@ -39,10 +40,10 @@ void layernorm_general(Tensor out, Tensor input, Tensor weight, Tensor bias, flo ...@@ -39,10 +40,10 @@ void layernorm_general(Tensor out, Tensor input, Tensor weight, Tensor bias, flo
size_t size_shmem = input.scalar_size() * hidden_size; size_t size_shmem = input.scalar_size() * hidden_size;
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
using T = typename packed_as<scalar_t, 2>::type; using T = typename packed_as<scalar_t, 2>::type;
vllm::generalLayerNorm<T, half, true><<<grid, block, size_shmem, stream>>>( hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half, true>), dim3(grid), dim3(block), size_shmem, stream,
reinterpret_cast<T *>(input.data_ptr<scalar_t>()), reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
weight.valid() ? reinterpret_cast<T *>(weight.data_ptr<scalar_t>()) : nullptr, weight.valid() ? reinterpret_cast<T *>(weight.data_ptr<scalar_t>()) : nullptr,
bias.valid() ? reinterpret_cast<T *>(bias.data_ptr<scalar_t>()) : nullptr, bias.valid() ? reinterpret_cast<T *>(bias.data_ptr<scalar_t>()) : nullptr,
...@@ -69,13 +70,13 @@ void rms_norm_general(Tensor &out, // [..., hidden_size] ...@@ -69,13 +70,13 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
block.x = 32 * ((block.x + 31) / 32); block.x = 32 * ((block.x + 31) / 32);
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
using T = scalar_t; using T = scalar_t;
if (use_per_token_quant) { if (use_per_token_quant) {
// per-token // per-token
vllm::generalLayerNorm<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -92,8 +93,8 @@ void rms_norm_general(Tensor &out, // [..., hidden_size] ...@@ -92,8 +93,8 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
// weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size); // weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
} else { } else {
// per-tensor // per-tensor
vllm::generalLayerNorm<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -121,13 +122,13 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size] ...@@ -121,13 +122,13 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
block.x = 32 * ((block.x + 31) / 32); block.x = 32 * ((block.x + 31) / 32);
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm_fuse_sum", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm_fuse_sum", [&] {
using T = scalar_t; using T = scalar_t;
if (use_per_token_quant) { if (use_per_token_quant) {
// per-token // per-token
vllm::generalLayerNorm_fuse_sum<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm_fuse_sum<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -149,8 +150,8 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size] ...@@ -149,8 +150,8 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
// Not implemented per-tensor input_sum // Not implemented per-tensor input_sum
assert(false); assert(false);
vllm::generalLayerNorm_fuse_sum<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm_fuse_sum<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -176,10 +177,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde ...@@ -176,10 +177,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
int num_tokens = input.numel() / hidden_size; int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] { VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false> hipLaunchKernelGGL(( vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false>)
<<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(), , dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
residual.data_ptr<scalar_t>(), residual.data_ptr<scalar_t>(),
out.data_ptr<int8_t>(), out.data_ptr<int8_t>(),
gamma.data_ptr<scalar_t>(), gamma.data_ptr<scalar_t>(),
...@@ -202,10 +203,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde ...@@ -202,10 +203,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] { VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true> hipLaunchKernelGGL(( vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true>)
<<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(), , dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
residual.data_ptr<scalar_t>(), residual.data_ptr<scalar_t>(),
out.data_ptr<int8_t>(), out.data_ptr<int8_t>(),
gamma.data_ptr<scalar_t>(), gamma.data_ptr<scalar_t>(),
......
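Note on the hunks above: every triple-chevron launch is rewritten to hipLaunchKernelGGL(kernel, grid, block, sharedMemBytes, stream, args...); the extra parentheses around templated kernel names are how hipify keeps commas inside template arguments out of the macro's argument list. A hedged, self-contained sketch of the equivalence (scale_kernel is a made-up example, not from this repository):

#include <hip/hip_runtime.h>

__global__ void scale_kernel(float *x, float a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}

void launch_scale(float *x, float a, int n, hipStream_t stream) {
    dim3 block(256);
    dim3 grid((n + block.x - 1) / block.x);
    // CUDA form:      scale_kernel<<<grid, block, 0, stream>>>(x, a, n);
    // hipified form (what the hunks above emit):
    hipLaunchKernelGGL(scale_kernel, grid, block, 0, stream, x, a, n);
}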
-#include <cuda_bf16.h>
+#include "hip/hip_runtime.h"
+#include <hip/hip_bf16.h>
 #define ENABLE_BF16 1
......
#include "hip/hip_runtime.h"
#include "misc_kernels_impl.cuh" #include "misc_kernels_impl.cuh"
#include "misc_kernels.h" #include "misc_kernels.h"
#include "dispatch_utils.h" #include "dispatch_utils.h"
...@@ -13,12 +14,12 @@ Tensor add(Tensor a, Tensor b) { ...@@ -13,12 +14,12 @@ Tensor add(Tensor a, Tensor b) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (a.numel() + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (a.numel() + threadsPerBlock - 1) / threadsPerBlock;
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
Tensor out = Tensor::empty_like(a); Tensor out = Tensor::empty_like(a);
dispatch(out.scalar_type(), [&]<typename scalar_t>() { dispatch(out.scalar_type(), [&]<typename scalar_t>() {
add_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( add_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
a.data_ptr<scalar_t>(), b.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(), out.numel()); a.data_ptr<scalar_t>(), b.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(), out.numel());
}); });
...@@ -46,12 +47,12 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) { ...@@ -46,12 +47,12 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll); int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
if (scale.valid()) { if (scale.valid()) {
mul_add_kernel<scalar_t, unroll, false> hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, false>)
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(), , dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
scale.data_ptr<scalar_t>(), scale.data_ptr<scalar_t>(),
bias.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
0, 0,
...@@ -62,7 +63,7 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) { ...@@ -62,7 +63,7 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
0, 0,
0); 0);
} else { } else {
mul_add_kernel<scalar_t, unroll, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, true>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
x.data_ptr<scalar_t>(), nullptr, bias.data_ptr<scalar_t>(), 0, x.numel(), 1, bias.numel(), 0, 0, 0); x.data_ptr<scalar_t>(), nullptr, bias.data_ptr<scalar_t>(), 0, x.numel(), 1, bias.numel(), 0, 0, 0);
} }
}); });
...@@ -96,12 +97,12 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift, ...@@ -96,12 +97,12 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
dim3 grid(ceilDiv(numel, threadsPerBlock * unroll), batch_size); dim3 grid(ceilDiv(numel, threadsPerBlock * unroll), batch_size);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
if (scale.valid()) { if (scale.valid()) {
mul_add_kernel<scalar_t, unroll, false> hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, false>)
<<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(), , dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
scale.data_ptr<scalar_t>(), scale.data_ptr<scalar_t>(),
bias.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
(scalar_t)scale_shift, (scalar_t)scale_shift,
...@@ -112,8 +113,8 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift, ...@@ -112,8 +113,8 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
batch_scale ? scale.stride(0) : 0, batch_scale ? scale.stride(0) : 0,
batch_bias ? bias.stride(0) : 0); batch_bias ? bias.stride(0) : 0);
} else { } else {
mul_add_kernel<scalar_t, unroll, true> hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, true>)
<<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(), , dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
nullptr, nullptr,
bias.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
(scalar_t)scale_shift, (scalar_t)scale_shift,
...@@ -134,12 +135,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) { ...@@ -134,12 +135,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
auto shapeOut = input_id.shape; auto shapeOut = input_id.shape;
shapeOut.dataExtent.push_back(lookup.shape[-1]); shapeOut.dataExtent.push_back(lookup.shape[-1]);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
Tensor out = Tensor::empty(shapeOut, lookup.scalar_type(), input_id.device()); Tensor out = Tensor::empty(shapeOut, lookup.scalar_type(), input_id.device());
dispatch(out.scalar_type(), [&]<typename scalar_t>() { dispatch(out.scalar_type(), [&]<typename scalar_t>() {
EmbeddingKernel<<<input_id.numel(), std::min(lookup.shape[-1], 1024), 0, stream>>>( hipLaunchKernelGGL(( EmbeddingKernel), dim3(input_id.numel()), dim3(std::min(lookup.shape[-1], 1024)), 0, stream,
input_id.data_ptr<int32_t>(), out.data_ptr<scalar_t>(), lookup.data_ptr<scalar_t>(), lookup.shape[-1]); input_id.data_ptr<int32_t>(), out.data_ptr<scalar_t>(), lookup.data_ptr<scalar_t>(), lookup.shape[-1]);
}); });
...@@ -149,12 +150,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) { ...@@ -149,12 +150,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
Tensor argmax_sample(Tensor logits) { Tensor argmax_sample(Tensor logits) {
assert(logits.ndims() == 2); assert(logits.ndims() == 2);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
Tensor out = Tensor::empty({logits.shape[0]}, Tensor::INT32, logits.device()); Tensor out = Tensor::empty({logits.shape[0]}, Tensor::INT32, logits.device());
dispatch(logits.scalar_type(), [&]<typename scalar_t>() { dispatch(logits.scalar_type(), [&]<typename scalar_t>() {
argmax_sample_kernel<<<logits.shape[0], std::min(logits.shape[1], 1024), 0, stream>>>( hipLaunchKernelGGL(( argmax_sample_kernel), dim3(logits.shape[0]), dim3(std::min(logits.shape[1], 1024)), 0, stream,
logits.data_ptr<scalar_t>(), out.data_ptr<int32_t>(), logits.shape[1]); logits.data_ptr<scalar_t>(), out.data_ptr<int32_t>(), logits.shape[1]);
}); });
...@@ -167,7 +168,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) { ...@@ -167,7 +168,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
// assert(qkv.shape[0] == k.shape[0]); // assert(qkv.shape[0] == k.shape[0]);
// assert(qkv.shape[0] == v.shape[0]); // assert(qkv.shape[0] == v.shape[0]);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
int dim_q = q.shape[-1] * q.shape[-2]; int dim_q = q.shape[-1] * q.shape[-2];
int dim_k = k.shape[-1] * k.shape[-2]; int dim_k = k.shape[-1] * k.shape[-2];
...@@ -179,7 +180,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) { ...@@ -179,7 +180,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
int num_tokens = qkv.numel() / qkv.shape[-1]; int num_tokens = qkv.numel() / qkv.shape[-1];
dispatch(qkv.scalar_type(), [&]<typename scalar_t>() { dispatch(qkv.scalar_type(), [&]<typename scalar_t>() {
splitqkv_kernel<<<num_tokens, std::min(qkv.shape[-1], 1024), 0, stream>>>(qkv.data_ptr<scalar_t>(), hipLaunchKernelGGL(( splitqkv_kernel), dim3(num_tokens), dim3(std::min(qkv.shape[-1], 1024)), 0, stream, qkv.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), q.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(),
v.data_ptr<scalar_t>(), v.data_ptr<scalar_t>(),
...@@ -195,7 +196,7 @@ std::array<Tensor, N> split_mod(Tensor input) { ...@@ -195,7 +196,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (input.numel() + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (input.numel() + threadsPerBlock - 1) / threadsPerBlock;
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto shapeOut = TensorShape(input.shape.dataExtent); auto shapeOut = TensorShape(input.shape.dataExtent);
shapeOut[-1] /= N; shapeOut[-1] /= N;
...@@ -210,7 +211,7 @@ std::array<Tensor, N> split_mod(Tensor input) { ...@@ -210,7 +211,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
for (int k = 0; k < N; k++) { for (int k = 0; k < N; k++) {
outPtr[k] = out[k].template data_ptr<scalar_t>(); outPtr[k] = out[k].template data_ptr<scalar_t>();
} }
split_mod_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( split_mod_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
input.data_ptr<scalar_t>(), outPtr, input.numel()); input.data_ptr<scalar_t>(), outPtr, input.numel());
}); });
...@@ -227,10 +228,10 @@ Tensor quant_static(Tensor x, float scale) { ...@@ -227,10 +228,10 @@ Tensor quant_static(Tensor x, float scale) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll); int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
quant_kernel_static<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( quant_kernel_static<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel()); x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
}); });
...@@ -247,10 +248,10 @@ Tensor quant_static_fuse_gelu(Tensor x, float scale) { ...@@ -247,10 +248,10 @@ Tensor quant_static_fuse_gelu(Tensor x, float scale) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll); int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
quant_kernel_static_fuse_gelu<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( quant_kernel_static_fuse_gelu<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel()); x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
}); });
...@@ -266,7 +267,7 @@ void cast(Tensor input, Tensor output) { ...@@ -266,7 +267,7 @@ void cast(Tensor input, Tensor output) {
assert(input.scalar_size() == output.scalar_size()); assert(input.scalar_size() == output.scalar_size());
} }
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(input.scalar_type(), [&]<typename input_t>() { dispatch(input.scalar_type(), [&]<typename input_t>() {
dispatch(output.scalar_type(), [&]<typename output_t>() { dispatch(output.scalar_type(), [&]<typename output_t>() {
...@@ -275,10 +276,10 @@ void cast(Tensor input, Tensor output) { ...@@ -275,10 +276,10 @@ void cast(Tensor input, Tensor output) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (int)ceilDiv<int64_t>(input.numel(), threadsPerBlock * unroll); int blocksPerGrid = (int)ceilDiv<int64_t>(input.numel(), threadsPerBlock * unroll);
cast_kernel<input_t, output_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( cast_kernel<input_t, output_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
input.data_ptr<input_t>(), output.data_ptr<output_t>(), input.numel()); input.data_ptr<input_t>(), output.data_ptr<output_t>(), input.numel());
checkCUDA(cudaGetLastError()); checkCUDA(hipGetLastError());
}); });
}); });
} }
...@@ -298,7 +299,7 @@ Tensor topk(Tensor x, int k) { ...@@ -298,7 +299,7 @@ Tensor topk(Tensor x, int k) {
Tensor out = Tensor::empty(outShape, Tensor::INT32, x.device()); Tensor out = Tensor::empty(outShape, Tensor::INT32, x.device());
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatchVal(k, std::make_integer_sequence<int, MAXK + 1>(), [&]<int K>() { dispatchVal(k, std::make_integer_sequence<int, MAXK + 1>(), [&]<int K>() {
if constexpr (K == 0) { if constexpr (K == 0) {
...@@ -307,9 +308,9 @@ Tensor topk(Tensor x, int k) { ...@@ -307,9 +308,9 @@ Tensor topk(Tensor x, int k) {
} }
if constexpr (K > 0) { if constexpr (K > 0) {
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
topk_kernel<scalar_t, K><<<ceilDiv(batch, 32), 32, 0, stream>>>( hipLaunchKernelGGL(( topk_kernel<scalar_t, K>), dim3(ceilDiv(batch, 32)), dim3(32), 0, stream,
x.data_ptr<scalar_t>(), out.data_ptr<int>(), N, x.stride(-2), batch); x.data_ptr<scalar_t>(), out.data_ptr<int>(), N, x.stride(-2), batch);
checkCUDA(cudaGetLastError()); checkCUDA(hipGetLastError());
}); });
} }
}); });
......
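Note on the hunks above: besides the launch rewrites, cudaGetLastError() becomes hipGetLastError(). The repository's checkCUDA macro is assumed to wrap the usual error-to-string-and-abort pattern, roughly like the sketch below (CHECK_HIP is a hypothetical stand-in, not the project's macro):

#include <cstdio>
#include <cstdlib>
#include <hip/hip_runtime.h>

// Abort with a readable message when a HIP runtime call fails.
#define CHECK_HIP(expr)                                                     \
    do {                                                                    \
        hipError_t err_ = (expr);                                           \
        if (err_ != hipSuccess) {                                           \
            std::fprintf(stderr, "HIP error %s at %s:%d\n",                 \
                         hipGetErrorString(err_), __FILE__, __LINE__);      \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Typical use right after a kernel launch, mirroring the hunks above:
//   CHECK_HIP(hipGetLastError());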
#include "hip/hip_runtime.h"
#include "reduction_utils.cuh" #include "reduction_utils.cuh"
#include <array> #include <array>
#include <cuda_fp16.h> #include <hip/hip_fp16.h>
#include <cuda_bf16.h> #include <hip/hip_bf16.h>
#include "utils.cuh" #include "utils.cuh"
#include "activation_kernels_impl.cuh" #include "activation_kernels_impl.cuh"
......
#include "hip/hip_runtime.h"
/* /*
* Adapted from * Adapted from
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
......
#include "hip/hip_runtime.h"
// Adated from FasterTransformer, // Adated from FasterTransformer,
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
#pragma once #pragma once
...@@ -9,10 +10,10 @@ ...@@ -9,10 +10,10 @@
#include <cstdio> #include <cstdio>
#include <cuda_fp16.h> #include <hip/hip_fp16.h>
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
#include <cuda_bf16.h> #include <hip/hip_bf16.h>
#endif #endif
__device__ __forceinline__ static void trap_unsupported_arch() { __device__ __forceinline__ static void trap_unsupported_arch() {
...@@ -24,11 +25,11 @@ __device__ __forceinline__ static void trap_unsupported_arch() { ...@@ -24,11 +25,11 @@ __device__ __forceinline__ static void trap_unsupported_arch() {
__trap(); __trap();
} }
#if defined(ENABLE_BF16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(ENABLE_BF16) && defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
__device__ __forceinline__ static __nv_bfloat162 __device__ __forceinline__ static __hip_bfloat162
__hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) { __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c) {
trap_unsupported_arch(); trap_unsupported_arch();
return __nv_bfloat162(0.0f, 0.0f); return __hip_bfloat162(0.0f, 0.0f);
} }
#endif #endif
...@@ -56,11 +57,11 @@ struct num_elems<half2> { ...@@ -56,11 +57,11 @@ struct num_elems<half2> {
}; };
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
struct num_elems<__nv_bfloat16> { struct num_elems<__hip_bfloat16> {
static constexpr int value = 1; static constexpr int value = 1;
}; };
template<> template<>
struct num_elems<__nv_bfloat162> { struct num_elems<__hip_bfloat162> {
static constexpr int value = 2; static constexpr int value = 2;
}; };
#endif #endif
...@@ -107,12 +108,12 @@ struct packed_as<float2, 1> { ...@@ -107,12 +108,12 @@ struct packed_as<float2, 1> {
}; };
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
struct packed_as<__nv_bfloat16, 2> { struct packed_as<__hip_bfloat16, 2> {
using type = __nv_bfloat162; using type = __hip_bfloat162;
}; };
template<> template<>
struct packed_as<__nv_bfloat162, 1> { struct packed_as<__hip_bfloat162, 1> {
using type = __nv_bfloat16; using type = __hip_bfloat16;
}; };
#endif #endif
#ifdef ENABLE_FP8 #ifdef ENABLE_FP8
...@@ -169,8 +170,8 @@ inline __device__ T ldg(const T *val) { ...@@ -169,8 +170,8 @@ inline __device__ T ldg(const T *val) {
#define bf1622float2 __bfloat1622float2 #define bf1622float2 __bfloat1622float2
#define float22bf162 __float22bfloat162_rn #define float22bf162 __float22bfloat162_rn
#define bf162bf162 __bfloat162bfloat162 #define bf162bf162 __bfloat162bfloat162
inline __device__ int16_t bf1622int16(__nv_bfloat162 val) { inline __device__ int16_t bf1622int16(__hip_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
float2 f_val; float2 f_val;
f_val.x = max(min(__low2float(val), 127.f), -128.f); f_val.x = max(min(__low2float(val), 127.f), -128.f);
f_val.y = max(min(__high2float(val), 127.f), -128.f); f_val.y = max(min(__high2float(val), 127.f), -128.f);
...@@ -201,8 +202,8 @@ inline __device__ int16_t bf1622int16(__nv_bfloat162 val) { ...@@ -201,8 +202,8 @@ inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
#if ENABLE_BF16 #if ENABLE_BF16
template<> template<>
inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) { inline __device__ __hip_bfloat162 ldg(const __hip_bfloat162 *val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
return val[0]; return val[0];
#else #else
return __ldg(val); return __ldg(val);
...@@ -210,8 +211,8 @@ inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) { ...@@ -210,8 +211,8 @@ inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) {
} }
template<> template<>
inline __device__ __nv_bfloat16 ldg(const __nv_bfloat16 *val) { inline __device__ __hip_bfloat16 ldg(const __hip_bfloat16 *val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
return val[0]; return val[0];
#else #else
return __ldg(val); return __ldg(val);
...@@ -330,81 +331,81 @@ __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) { ...@@ -330,81 +331,81 @@ __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast(int32_t val) { __device__ inline __hip_bfloat16 cuda_cast(int32_t val) {
return static_cast<float>(val); return static_cast<float>(val);
} }
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast(int8_t val) { __device__ inline __hip_bfloat16 cuda_cast(int8_t val) {
return static_cast<float>(val); return static_cast<float>(val);
} }
template<> template<>
__device__ inline int8_t cuda_cast(__nv_bfloat16 val) { __device__ inline int8_t cuda_cast(__hip_bfloat16 val) {
return static_cast<float>(val); return static_cast<float>(val);
} }
template<> template<>
__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) { __device__ inline float cuda_cast<float, __hip_bfloat16>(__hip_bfloat16 val) {
return __bfloat162float(val); return __bfloat162float(val);
} }
template<> template<>
__device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) { __device__ inline float2 cuda_cast<float2, __hip_bfloat162>(__hip_bfloat162 val) {
return bf1622float2(val); return bf1622float2(val);
} }
template<> template<>
__device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val) { __device__ inline half cuda_cast<half, __hip_bfloat16>(__hip_bfloat16 val) {
return __float2half(__bfloat162float(val)); return __float2half(__bfloat162float(val));
} }
template<> template<>
__device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) { __device__ inline int16_t cuda_cast<int16_t, __hip_bfloat162>(__hip_bfloat162 val) {
return bf1622int16(val); return bf1622int16(val);
} }
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) { __device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, float>(float val) {
return __float2bfloat16(val); return __float2bfloat16(val);
} }
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val) { __device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, half>(half val) {
return __float2bfloat16(__half2float(val)); return __float2bfloat16(__half2float(val));
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, __hip_bfloat16>(__hip_bfloat16 val) {
return bf162bf162(val); return bf162bf162(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float>(float val) {
return __float2bfloat162_rn(val); return __float2bfloat162_rn(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float2>(float2 val) {
return float22bf162(val); return float22bf162(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, int16_t>(int16_t val) {
union { union {
int8_t int8[2]; int8_t int8[2];
int16_t int16; int16_t int16;
}; };
int16 = val; int16 = val;
__nv_bfloat162 res; __hip_bfloat162 res;
res.x = cuda_cast<__nv_bfloat16>(int8[0]); res.x = cuda_cast<__hip_bfloat16>(int8[0]);
res.y = cuda_cast<__nv_bfloat16>(int8[1]); res.y = cuda_cast<__hip_bfloat16>(int8[1]);
return res; return res;
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, half2>(half2 val) {
return float22bf162(__half22float2(val)); return float22bf162(__half22float2(val));
} }
...@@ -420,7 +421,7 @@ __device__ __forceinline__ packed_as<half, 2>::type f162f162<half>(half x) { ...@@ -420,7 +421,7 @@ __device__ __forceinline__ packed_as<half, 2>::type f162f162<half>(half x) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
__device__ __forceinline__ packed_as<__nv_bfloat16, 2>::type f162f162<__nv_bfloat16>(__nv_bfloat16 x) { __device__ __forceinline__ packed_as<__hip_bfloat16, 2>::type f162f162<__hip_bfloat16>(__hip_bfloat16 x) {
return __bfloat162bfloat162(x); return __bfloat162bfloat162(x);
} }
#endif #endif
...@@ -453,8 +454,8 @@ __device__ inline half cuda_max(half2 val) { ...@@ -453,8 +454,8 @@ __device__ inline half cuda_max(half2 val) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
__device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val) { __device__ inline __hip_bfloat16 cuda_max(__hip_bfloat162 val) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) #if (defined(__DTK_ARCH__) && (__DTK_ARCH__ >= 800))
return __hmax(val.x, val.y); return __hmax(val.x, val.y);
#else #else
assert(false); assert(false);
...@@ -497,14 +498,14 @@ __device__ inline half2 cuda_abs(half2 val) { ...@@ -497,14 +498,14 @@ __device__ inline half2 cuda_abs(half2 val) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) #if __DTK_ARCH__ >= 800 || !defined(__DTK_ARCH__)
template<> template<>
__device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { __device__ inline __hip_bfloat16 cuda_abs(__hip_bfloat16 val) {
return __habs(val); return __habs(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { __device__ inline __hip_bfloat162 cuda_abs(__hip_bfloat162 val) {
return __habs2(val); return __habs2(val);
} }
#endif #endif
......
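Note on the hunks above: the bf16 vector types are renamed (__nv_bfloat16/__nv_bfloat162 -> __hip_bfloat16/__hip_bfloat162) and architecture guards switch from __CUDA_ARCH__ to __DTK_ARCH__, while the conversion intrinsics (__bfloat162float, __bfloat1622float2, __float22bfloat162_rn) keep their names. A hedged sketch of the resulting device-side usage, assuming <hip/hip_bf16.h> provides those intrinsics as the diff relies on:

#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>

// Sum the two bf16 lanes of a packed value by converting through float,
// the same route the cuda_cast/bf1622int16 helpers above take.
__device__ __forceinline__ static float bf162_lane_sum(__hip_bfloat162 v) {
    float2 f = __bfloat1622float2(v);   // packed bf16x2 -> float2
    return f.x + f.y;
}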
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "gemm_base.cuh" #include "gemm_base.cuh"
...@@ -26,8 +27,8 @@ struct AttentionFP16Config { ...@@ -26,8 +27,8 @@ struct AttentionFP16Config {
using half_t = half; using half_t = half;
using half2_t = half2; using half2_t = half2;
using epilogue_half_t = typename std::conditional_t<bf16out, __nv_bfloat16, half>; using epilogue_half_t = typename std::conditional_t<bf16out, __hip_bfloat16, half>;
using epilogue_half2_t = typename std::conditional_t<bf16out, __nv_bfloat162, half2>; using epilogue_half2_t = typename std::conditional_t<bf16out, __hip_bfloat162, half2>;
}; };
using AttentionFP16Config_FP16 = AttentionFP16Config<false>; using AttentionFP16Config_FP16 = AttentionFP16Config<false>;
...@@ -60,7 +61,7 @@ public: ...@@ -60,7 +61,7 @@ public:
using typename AttentionConfig::epilogue_half_t; using typename AttentionConfig::epilogue_half_t;
using typename AttentionConfig::epilogue_half2_t; using typename AttentionConfig::epilogue_half2_t;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
static constexpr bool IS_SM80 = true; static constexpr bool IS_SM80 = true;
#else #else
static constexpr bool IS_SM80 = false; static constexpr bool IS_SM80 = false;
...@@ -657,7 +658,7 @@ public: ...@@ -657,7 +658,7 @@ public:
template<typename Epilogue> template<typename Epilogue>
struct attention_fp16_kernel { struct attention_fp16_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int SHMEM_SIZE = 0; // sizeof(q_shmem_t); static constexpr int SHMEM_SIZE = 0; // sizeof(q_shmem_t);
__device__ void operator()(const packed_q_t *ptr_q, __device__ void operator()(const packed_q_t *ptr_q,
......
#include "hip/hip_runtime.h"
#include "zgemm.h" #include "zgemm.h"
#include "attention.cuh" #include "attention.cuh"
...@@ -71,10 +72,10 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM] ...@@ -71,10 +72,10 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE); shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE);
if (shmem >= 24 * 1024) { if (shmem >= 24 * 1024) {
checkCUDA(cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); checkCUDA(hipFuncSetAttribute(func, hipFuncAttributeMaxDynamicSharedMemorySize, shmem));
} }
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(q.data_ptr<packed_q_t>(), hipLaunchKernelGGL(( func), dim3(grid), dim3(GEMM::WARP_SIZE * GEMM::NUM_WARPS), shmem, getCurrentHIPStreamMasqueradingAsCUDA(), q.data_ptr<packed_q_t>(),
k.data_ptr<packed_k_t>(), k.data_ptr<packed_k_t>(),
v.data_ptr<packed_v_t>(), v.data_ptr<packed_v_t>(),
scale, scale,
...@@ -82,7 +83,7 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM] ...@@ -82,7 +83,7 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
numTokensKV, numTokensKV,
args, args,
false); false);
checkCUDA(cudaGetLastError()); checkCUDA(hipGetLastError());
}; };
launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{ launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{
......
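Note on the hunk above: cudaFuncSetAttribute/cudaFuncAttributeMaxDynamicSharedMemorySize become hipFuncSetAttribute/hipFuncAttributeMaxDynamicSharedMemorySize before a launch that needs a large dynamic shared-memory carveout. A self-contained sketch of that sequence (big_smem_kernel and launch_big_smem are illustrative, not from this repository):

#include <hip/hip_runtime.h>

__global__ void big_smem_kernel(float *out) {
    extern __shared__ float smem[];   // dynamic shared memory
    smem[threadIdx.x] = static_cast<float>(threadIdx.x);
    __syncthreads();
    out[threadIdx.x] = smem[threadIdx.x];
}

void launch_big_smem(float *out, hipStream_t stream) {
    size_t shmem = 32 * 1024;  // above the 24 KiB threshold used in the hunk above
    hipFuncSetAttribute(reinterpret_cast<const void *>(big_smem_kernel),
                        hipFuncAttributeMaxDynamicSharedMemorySize,
                        static_cast<int>(shmem));
    hipLaunchKernelGGL(big_smem_kernel, dim3(1), dim3(256), shmem, stream, out);
}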
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "gemm_base.cuh" #include "gemm_base.cuh"
...@@ -702,7 +703,7 @@ public: ...@@ -702,7 +703,7 @@ public:
// q: [batch_size, #blocks, block_size, #heads, HEAD_DIM] // q: [batch_size, #blocks, block_size, #heads, HEAD_DIM]
// vk: [batch_size, #heads, HEAD_DIM+1, HEAD_DIM] // vk: [batch_size, #heads, HEAD_DIM+1, HEAD_DIM]
struct vk_mul_q_kernel { struct vk_mul_q_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
// FIXME FIXME FIXME // FIXME FIXME FIXME
__device__ void operator()(half_t *q, const float *vk, float eps, int num_tokens) { __device__ void operator()(half_t *q, const float *vk, float eps, int num_tokens) {
const int block_id = blockIdx.x; const int block_id = blockIdx.x;
...@@ -762,7 +763,7 @@ public: ...@@ -762,7 +763,7 @@ public:
template<typename Epilogue> template<typename Epilogue>
struct test_epilogue_kernel { struct test_epilogue_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr size_t SHMEM_PER_WARP = static constexpr size_t SHMEM_PER_WARP =
ceilDiv<size_t>(Base::template load_act_to_fpsum<false>::SHMEM_SIZE, 128) * 128; ceilDiv<size_t>(Base::template load_act_to_fpsum<false>::SHMEM_SIZE, 128) * 128;
static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS; static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
......
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "common.h" #include "common.h"
...@@ -44,8 +45,8 @@ public: ...@@ -44,8 +45,8 @@ public:
// may generate incorrect results in certain circumstances // may generate incorrect results in certain circumstances
static constexpr bool FASTER_I2F = faster_i2f; static constexpr bool FASTER_I2F = faster_i2f;
using half_t = typename std::conditional_t<bf16, __nv_bfloat16, half>; using half_t = typename std::conditional_t<bf16, __hip_bfloat16, half>;
using half2_t = typename std::conditional_t<bf16, __nv_bfloat162, half2>; using half2_t = typename std::conditional_t<bf16, __hip_bfloat162, half2>;
}; };
using GEMMConfig_W4A4_FP16 = GEMMConfig_W4A4<false>; using GEMMConfig_W4A4_FP16 = GEMMConfig_W4A4<false>;
...@@ -67,8 +68,8 @@ public: ...@@ -67,8 +68,8 @@ public:
using half_t = half; using half_t = half;
using half2_t = half2; using half2_t = half2;
#else #else
using half_t = __nv_bfloat16; using half_t = __hip_bfloat16;
using half2_t = __nv_bfloat162; using half2_t = __hip_bfloat162;
#endif #endif
}; };
...@@ -202,9 +203,9 @@ public: ...@@ -202,9 +203,9 @@ public:
__device__ __forceinline__ static packed_f32psum_t __device__ __forceinline__ static packed_f32psum_t
mma_f16xf16_f32(packed_fpsum_t a, packed_fpsum_t b, packed_f32psum_t psum) { mma_f16xf16_f32(packed_fpsum_t a, packed_fpsum_t b, packed_f32psum_t psum) {
static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __nv_bfloat16>); static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __hip_bfloat16>);
static constexpr bool is_bf16 = std::is_same_v<half_t, __nv_bfloat16>; static constexpr bool is_bf16 = std::is_same_v<half_t, __hip_bfloat16>;
uint4 out1 = mma_m16n8k16_f32f16f16f32<is_bf16>( uint4 out1 = mma_m16n8k16_f32f16f16f32<is_bf16>(
kernels::bit_cast<uint4>(a), kernels::bit_cast<uint4>(a),
...@@ -890,8 +891,8 @@ constexpr int max_arch() { ...@@ -890,8 +891,8 @@ constexpr int max_arch() {
template<typename kernel, typename... T> template<typename kernel, typename... T>
__global__ static void invoke_kernel(T... args) { __global__ static void invoke_kernel(T... args) {
#ifdef __CUDA_ARCH__ #ifdef __DTK_ARCH__
if constexpr (__CUDA_ARCH__ >= min_arch<kernel>() && __CUDA_ARCH__ <= max_arch<kernel>()) { if constexpr (__DTK_ARCH__ >= min_arch<kernel>() && __DTK_ARCH__ <= max_arch<kernel>()) {
kernel()(args...); kernel()(args...);
} else { } else {
trap_unsupported_arch(); trap_unsupported_arch();
...@@ -916,8 +917,8 @@ template<typename T> ...@@ -916,8 +917,8 @@ template<typename T>
static void test_sizeof() { static void test_sizeof() {
printf("typeid = %s\n", typeid(T).name()); printf("typeid = %s\n", typeid(T).name());
test_sizeof_host<T>(); test_sizeof_host<T>();
test_sizeof_device<T><<<1, 1>>>(); hipLaunchKernelGGL(( test_sizeof_device<T>), dim3(1), dim3(1), 0, 0, );
checkCUDA(cudaDeviceSynchronize()); checkCUDA(hipDeviceSynchronize());
} }
}; // namespace nunchaku::kernels }; // namespace nunchaku::kernels
...@@ -163,7 +163,7 @@ __device__ __forceinline__ static float2 half22float2(half2 val) { ...@@ -163,7 +163,7 @@ __device__ __forceinline__ static float2 half22float2(half2 val) {
return __half22float2(val); return __half22float2(val);
} }
__device__ __forceinline__ static float2 half22float2(__nv_bfloat162 val) { __device__ __forceinline__ static float2 half22float2(__hip_bfloat162 val) {
return __bfloat1622float2(val); return __bfloat1622float2(val);
} }
...@@ -176,7 +176,7 @@ __device__ __forceinline__ half2 float22half2<half2>(float2 val) { ...@@ -176,7 +176,7 @@ __device__ __forceinline__ half2 float22half2<half2>(float2 val) {
} }
template<> template<>
__device__ __forceinline__ __nv_bfloat162 float22half2<__nv_bfloat162>(float2 val) { __device__ __forceinline__ __hip_bfloat162 float22half2<__hip_bfloat162>(float2 val) {
return __float22bfloat162_rn(val); return __float22bfloat162_rn(val);
} }
...@@ -334,13 +334,13 @@ __device__ __forceinline__ static half2 h2div(half2 a, half2 b) { ...@@ -334,13 +334,13 @@ __device__ __forceinline__ static half2 h2div(half2 a, half2 b) {
of.y = __fdividef(af.y, bf.y); of.y = __fdividef(af.y, bf.y);
return float22half2<half2>(of); return float22half2<half2>(of);
}; };
__device__ __forceinline__ static __nv_bfloat162 h2div(__nv_bfloat162 a, __nv_bfloat162 b) { __device__ __forceinline__ static __hip_bfloat162 h2div(__hip_bfloat162 a, __hip_bfloat162 b) {
float2 af = half22float2(a); float2 af = half22float2(a);
float2 bf = half22float2(b); float2 bf = half22float2(b);
float2 of; float2 of;
of.x = __fdividef(af.x, bf.x); of.x = __fdividef(af.x, bf.x);
of.y = __fdividef(af.y, bf.y); of.y = __fdividef(af.y, bf.y);
return float22half2<__nv_bfloat162>(of); return float22half2<__hip_bfloat162>(of);
}; };
__device__ __forceinline__ static void reduce_add(float *addr, float val) { __device__ __forceinline__ static void reduce_add(float *addr, float val) {
......
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "gemm_base.cuh" #include "gemm_base.cuh"
...@@ -25,7 +26,7 @@ public: ...@@ -25,7 +26,7 @@ public:
// micro-scales for FP4 MMA // micro-scales for FP4 MMA
// each uint32_t is a 4*32 matrix of scales (for MMA of 64*32) // each uint32_t is a 4*32 matrix of scales (for MMA of 64*32)
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1200 #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 1200
static constexpr bool FP4_AVAILABLE = true; static constexpr bool FP4_AVAILABLE = true;
#else #else
static constexpr bool FP4_AVAILABLE = false; static constexpr bool FP4_AVAILABLE = false;
...@@ -623,7 +624,7 @@ public: ...@@ -623,7 +624,7 @@ public:
// each thread block (1 warp) quantize WARP_M * WARP_K tile (32 * 64) // each thread block (1 warp) quantize WARP_M * WARP_K tile (32 * 64)
struct quantize_w4a4_act_kernel { struct quantize_w4a4_act_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
__device__ void operator()(const half_t *input, packed_act_t *output, packed_ascale_t *oscales, int K) { __device__ void operator()(const half_t *input, packed_act_t *output, packed_ascale_t *oscales, int K) {
const int laneId = threadIdx.x % WARP_SIZE; const int laneId = threadIdx.x % WARP_SIZE;
...@@ -660,7 +661,7 @@ public: ...@@ -660,7 +661,7 @@ public:
// each thread block (1 warp) quantize WARP_N * WARP_K tile (128 * 64) // each thread block (1 warp) quantize WARP_N * WARP_K tile (128 * 64)
struct quantize_w4a4_wgt_kernel { struct quantize_w4a4_wgt_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
__device__ void operator()(const half_t *input, packed_wgt_t *output, packed_wscale_t *oscales, int K) { __device__ void operator()(const half_t *input, packed_wgt_t *output, packed_wscale_t *oscales, int K) {
const int laneId = threadIdx.x % WARP_SIZE; const int laneId = threadIdx.x % WARP_SIZE;
...@@ -721,9 +722,9 @@ public: ...@@ -721,9 +722,9 @@ public:
template<bool ACT_UNSIGNED, typename T> template<bool ACT_UNSIGNED, typename T>
__device__ __forceinline__ static void __device__ __forceinline__ static void
compute(act_warp A, wgt_warp W, ascale_warp ascale, wscale_warp wscale, T &fpsum) { compute(act_warp A, wgt_warp W, ascale_warp ascale, wscale_warp wscale, T &fpsum) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ == 800
using int2half2 = i2f_sm80; using int2half2 = i2f_sm80;
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 #elif defined(__DTK_ARCH__) && __DTK_ARCH__ == 750
using int2half2 = std::conditional_t<Config::FASTER_I2F, i2f_sm75_fast, i2f_sm75>; using int2half2 = std::conditional_t<Config::FASTER_I2F, i2f_sm75_fast, i2f_sm75>;
; ;
#else #else
...@@ -901,7 +902,7 @@ public: ...@@ -901,7 +902,7 @@ public:
compute<ACT_UNSIGNED>(A[k2], W[k2], ascale[k2], wscale[k2], fpsum); compute<ACT_UNSIGNED>(A[k2], W[k2], ascale[k2], wscale[k2], fpsum);
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
if (alwaysfalse) { if (alwaysfalse) {
dummy = clock(); dummy = clock();
} }
...@@ -1045,7 +1046,7 @@ public: ...@@ -1045,7 +1046,7 @@ public:
template<typename Epilogue, bool ACT_UNSIGNED> template<typename Epilogue, bool ACT_UNSIGNED>
struct gemm_w4a4_kernel { struct gemm_w4a4_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MAX_ARCH = Config::FASTER_I2F ? 750 : INT_MAX; // FASTER_I2F is only needed on sm_75 static constexpr int MAX_ARCH = Config::FASTER_I2F ? 750 : INT_MAX; // FASTER_I2F is only needed on sm_75
__device__ void operator()(const packed_act_t *act, __device__ void operator()(const packed_act_t *act,
...@@ -1098,7 +1099,7 @@ public: ...@@ -1098,7 +1099,7 @@ public:
struct quantize_w4a4_fuse_lora_kernel { struct quantize_w4a4_fuse_lora_kernel {
using oscales_t = typename std::conditional_t<use_fp4, packed_amscale_t, packed_ascale_t>; using oscales_t = typename std::conditional_t<use_fp4, packed_amscale_t, packed_ascale_t>;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr size_t SHMEM_PER_WARP = static constexpr size_t SHMEM_PER_WARP =
ceilDiv<size_t>(Base::template load_act_to_fpsum<fuse_glu>::SHMEM_SIZE, 128) * 128; ceilDiv<size_t>(Base::template load_act_to_fpsum<fuse_glu>::SHMEM_SIZE, 128) * 128;
static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS; static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
......
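Note on the hunks above: across these GEMM/quantization headers the hipify change is the architecture macro, __CUDA_ARCH__ -> __DTK_ARCH__, so the per-architecture branches (MIN_ARCH/MAX_ARCH checks, i2f selection, FP4 availability) keep compiling under the new toolchain. A hedged sketch of the guard pattern used by invoke_kernel, with a made-up kernel functor; only the MIN_ARCH convention is taken from the diff:

#include <hip/hip_runtime.h>

struct example_kernel {
    static constexpr int MIN_ARCH = 750;  // same convention as the MIN_ARCH fields above
    __device__ void operator()(float *out, float v) const { *out = v; }
};

template<typename Kernel, typename... Args>
__global__ static void guarded_invoke(Args... args) {
#ifdef __DTK_ARCH__
    // Instantiate the body only on device passes targeting a new-enough arch;
    // older targets compile this to an empty kernel instead of failing to build.
    if constexpr (__DTK_ARCH__ >= Kernel::MIN_ARCH) {
        Kernel()(args...);
    }
#endif
}

// Launch example:
//   hipLaunchKernelGGL((guarded_invoke<example_kernel, float *, float>),
//                      dim3(1), dim3(1), 0, 0, out_ptr, 1.0f);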