// attention.cu
#include "zgemm.h"
#include "attention.cuh"

#ifndef M_LOG2E
#define M_LOG2E 1.4426950408889634074
#endif

namespace nunchaku::kernels {

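// Fused scaled dot-product attention over FP16 Q/K/V: per batch and head, o = softmax(q k^T * scale) v.
// Q/K/V use the packed [Batch, Head, Tokens, HEAD_DIM] layout the tiled kernel expects, while the
// output is written in the linear [Batch, TokensQ, Head * HEAD_DIM] layout and may be FP16 or BF16.
// `scale` is the softmax scaling factor, typically 1 / sqrt(HEAD_DIM).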
void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
                    Tensor k, // packed [Batch, Head, TokensKV, HEAD_DIM]
                    Tensor v, // packed [Batch, Head, TokensKV, HEAD_DIM]
                    Tensor o, // linear [Batch, TokensQ, Head * HEAD_DIM]
                    float scale) {
    int sizeBatch   = q.shape[0];
    int numHeads    = q.shape[1];
    int numTokensQ  = q.shape[2];
    int headDim     = q.shape[3];
    int numTokensKV = k.shape[2];

    assert(o.ndims() == 3);
    assert(o.shape[0] == sizeBatch);
    assert(o.shape[1] == numTokensQ);
    assert(o.shape[2] == numHeads * headDim);

    spdlog::trace("attention_fp16: B={} H={} NQ={} NK={}", sizeBatch, numHeads, numTokensQ, numTokensKV);
    spdlog::trace("q at {}", q.data_ptr());
    spdlog::trace("k at {}", k.data_ptr());
    spdlog::trace("v at {}", v.data_ptr());
    spdlog::trace("o at {}", o.data_ptr());
    spdlog::trace("scale={}", scale);

    dispatchBool(o.scalar_type() == Tensor::BF16, [&]<bool bf16out>() {
#ifndef __INTELLISENSE__
        using Attention = typename nunchaku::kernels::Attention<AttentionFP16Config<bf16out>>;
#else
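        // Give the IDE's IntelliSense parser a concrete instantiation; nvcc only compiles the branch above.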
        using Attention = typename nunchaku::kernels::Attention<AttentionFP16Config<true>>;
#endif
        using GEMM = typename Attention::GEMM;

        assert(isTypeMatch<typename Attention::half_t>(q.scalar_type()));
        assert(isTypeMatch<typename Attention::half_t>(k.scalar_type()));
        assert(isTypeMatch<typename Attention::half_t>(v.scalar_type()));
        assert(isTypeMatch<typename Attention::epilogue_half_t>(o.scalar_type()));

        int shmem = 0;

        // The kernel evaluates softmax with exp2 rather than exp; since exp(x) = exp2(x * log2(e)),
        // fold log2(e) into the scale once here instead of multiplying per element in the kernel.
        scale *= M_LOG2E;

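        // The kernel tiles the query sequence in BLOCK_M rows and steps over the KV sequence in
        // WARP_K chunks with a compile-time head dimension, so the shapes must divide evenly.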
        assert(numTokensQ % Attention::BLOCK_M == 0);
        assert(numTokensKV % Attention::WARP_K == 0);
        assert(headDim == Attention::HEAD_DIM);

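        // Launch helper: one thread block per (query tile, head, batch) cell; the Epilogue type
        // determines how the attention result is written back to global memory.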
        auto launch = [&]<typename Epilogue>(Epilogue::Arguments args) {
            dim3 grid(numTokensQ / Attention::BLOCK_M, numHeads, sizeBatch);
            using packed_q_t = typename Attention::packed_q_t;
            using packed_k_t = typename Attention::packed_k_t;
            using packed_v_t = typename Attention::packed_v_t;

            auto func = invoke_kernel<typename Attention::attention_fp16_kernel<Epilogue>,
                                      const packed_q_t *,
                                      const packed_k_t *,
                                      const packed_v_t *,
                                      float,
                                      int,
                                      int,
                                      typename Epilogue::Arguments,
                                      bool>;

            shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE);

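            // Requesting more dynamic shared memory than the default per-block limit (48 KB on
            // current architectures) requires an explicit opt-in; asking below the limit is harmless.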
            if (shmem >= 24 * 1024) {
                checkCUDA(cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
            }

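            // One block of GEMM::WARP_SIZE * GEMM::NUM_WARPS threads per grid cell, with `shmem`
            // bytes of dynamic shared memory, on the caller's current CUDA stream.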
            func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(q.data_ptr<packed_q_t>(),
                                                                                             k.data_ptr<packed_k_t>(),
                                                                                             v.data_ptr<packed_v_t>(),
                                                                                             scale,
                                                                                             numTokensQ,
                                                                                             numTokensKV,
                                                                                             args,
                                                                                             false);
            checkCUDA(cudaGetLastError());
        };

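        // Default epilogue: write the result into `o`, viewed as a (Batch * TokensQ) x (Heads * HEAD_DIM) matrix.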
        launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{
            .out     = o.data_ptr<typename GEMM::half_t>(),
            .actualM = sizeBatch * numTokensQ,
            .actualN = numHeads * headDim,
        });
    });
}

}; // namespace nunchaku::kernels