恢复支持旧接口

c353b35b · zhanghj2 · c566af36 · c353b35b · c353b35b · c353b35b
Commit c353b35b authored Feb 27, 2026 by zhanghj2
20 changed files
--- a/csrc/api/api.cpp
+++ b/csrc/api/api.cpp
@@ -5,7 +5,7 @@
 #include "dense_decode.h"
 #include "dense_decode_qkvfp8.h"
 #include "dense_decode_kvfp8.h"
-
+#include "../extension/flash_api.h"
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.doc() = "FlashMLA";
    m.def("sparse_decode_fwd", &sparse_attn_decode_interface);
@@ -13,4 +13,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dense_decode_fwd_qkvfp8", &dense_attn_decode_qkvfp8_interface);
    m.def("dense_decode_fwd_kvfp8", &dense_attn_decode_kvfp8_interface);
    m.def("sparse_prefill_fwd", &sparse_attn_prefill_interface);
+
+    m.def("get_mla_decoding_metadata_dense_fp8", &get_mla_decoding_metadata_dense_fp8);
+    m.def("fwd_kvcache_quantization_mla", &mha_fwd_kvcache_quantization_mla);
+    m.def("fwd_kvcache_quantization_q_nope_pe_mla", &mha_fwd_kvcache_quantization_q_nope_pe_mla);
+    m.def("fwd_kvcache_mla_nope_pe", &mha_fwd_kvcache_mla_nope_pe);
+    m.def("fwd_kvcache_mla_fp8", &mha_fwd_kvcache_mla_fp8);
+    m.def("fwd_kvcache_mla_fp8_with_cat", &mha_fwd_kvcache_mla_fp8_with_cat);
 }
--- a/csrc/extension/flash_api.h
+++ b/csrc/extension/flash_api.h
--- a/csrc/extension/flash_fwd_mla_bf16_gfx936.cu
+++ b/csrc/extension/flash_fwd_mla_bf16_gfx936.cu
+#include "flash_fwd_mla_kernel.h"
+
+template void run_mha_fwd_splitkv_mla<cutlass::bfloat16_t, 576>(Flash_fwd_mla_params &params, const std::string& kv_cache_dtype, cudaStream_t stream, bool is_q_nope_pe = false);
--- a/csrc/extension/flash_fwd_mla_fp16_gfx936.cu
+++ b/csrc/extension/flash_fwd_mla_fp16_gfx936.cu
+#include "flash_fwd_mla_kernel.h"
+
+template void run_mha_fwd_splitkv_mla<cutlass::half_t, 576>(Flash_fwd_mla_params &params, const std::string& kv_cache_dtype, cudaStream_t stream, bool is_q_nope_pe = false);
--- a/csrc/extension/flash_fwd_mla_fp8_gfx938.cu
+++ b/csrc/extension/flash_fwd_mla_fp8_gfx938.cu
+#include "flash_fwd_mla_kernel_fp8.h"
+
+template void run_mha_fwd_splitkv_mla_fp8<cutlass::float_e4m3_t,cutlass::bfloat16_t,576>(Flash_fwd_mla_params &params, cudaStream_t stream, bool is_with_cat);
--- a/csrc/extension/flash_fwd_mla_fp8_qbf16_gfx938.cu
+++ b/csrc/extension/flash_fwd_mla_fp8_qbf16_gfx938.cu
+#include "flash_fwd_mla_kernel_fp8.h"
+
+template void run_mha_fwd_splitkv_mla_fp8<cutlass::bfloat16_t,cutlass::bfloat16_t,576>(Flash_fwd_mla_params &params, cudaStream_t stream, bool is_with_cat);
\ No newline at end of file
--- a/csrc/extension/flash_fwd_mla_kernel.h
+++ b/csrc/extension/flash_fwd_mla_kernel.h
--- a/csrc/extension/flash_fwd_mla_kernel_fp8.h
+++ b/csrc/extension/flash_fwd_mla_kernel_fp8.h
--- a/csrc/extension/flash_fwd_mla_metadata.cu
+++ b/csrc/extension/flash_fwd_mla_metadata.cu
+#include "flash_fwd_mla_kernel.h"
+
+static constexpr int MaxBatchSize = 4096;
+
+__global__ void __launch_bounds__(64, 1)
+get_mla_metadata_kernel(const Mla_metadata_params params) {
+    int *seqlens_k_ptr = params.seqlens_k_ptr;
+    int *tile_scheduler_metadata_ptr = params.tile_scheduler_metadata_ptr;
+    int *num_splits_ptr = params.num_splits_ptr;
+    int batch_size = params.batch_size;
+    int block_size_n = params.block_size_n;
+    int fixed_overhead_num_blocks = params.fixed_overhead_num_blocks;
+    int num_sm_parts = params.num_sm_parts;
+
+    __shared__ int num_blocks_shared[MaxBatchSize];
+    __shared__ int num_splits_shared[MaxBatchSize];
+
+    int total_num_blocks = 0;
+    for (int i = threadIdx.x; i < batch_size; i += 64) {
+        int num_blocks = cutlass::ceil_div(seqlens_k_ptr[i], block_size_n);
+        total_num_blocks += num_blocks + fixed_overhead_num_blocks;
+        num_blocks_shared[i] = num_blocks;
+    }
+    for (int offset = 32; offset >= 1; offset /= 2) {
+        // total_num_blocks += __shfl_xor(uint32_t(-1), total_num_blocks, offset);
+        total_num_blocks += __shfl_xor(total_num_blocks, offset, 64);
+    }
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+        int payload = cutlass::ceil_div(total_num_blocks, num_sm_parts) + fixed_overhead_num_blocks;
+
+        int now_idx = 0, now_block = 0, now_n_split_idx = 0, cum_num_splits = 0;
+        num_splits_shared[0] = 0;
+        for (int i = 0; i < num_sm_parts; ++i) {
+            int tile_scheduler_metadata0[4], tile_scheduler_metadata1;
+            tile_scheduler_metadata0[0] = now_idx;
+            tile_scheduler_metadata0[1] = now_block * block_size_n;
+            tile_scheduler_metadata1 = now_n_split_idx;
+            int remain_payload = payload;
+            while (now_idx < batch_size) {
+                int num_blocks = num_blocks_shared[now_idx];
+                int now_remain_blocks = num_blocks - now_block;
+                if (remain_payload >= now_remain_blocks + fixed_overhead_num_blocks) {
+                    cum_num_splits += now_n_split_idx + 1;
+                    num_splits_shared[now_idx + 1] = cum_num_splits;
+                    remain_payload -= now_remain_blocks + fixed_overhead_num_blocks;
+                    ++now_idx;
+                    now_block = 0;
+                    now_n_split_idx = 0;
+                } else {
+                    if (remain_payload - fixed_overhead_num_blocks > 0) {
+                        now_block += remain_payload - fixed_overhead_num_blocks;
+                        ++now_n_split_idx;
+                        remain_payload = 0;
+                    }
+                    break;
+                }
+            }
+            tile_scheduler_metadata0[2] = now_block > 0 ? now_idx : now_idx - 1;
+            tile_scheduler_metadata0[3] = now_block > 0 ? now_block * block_size_n : seqlens_k_ptr[now_idx - 1];
+            *reinterpret_cast<int4 *>(tile_scheduler_metadata_ptr + i * TileSchedulerMetaDataSize) = *reinterpret_cast<int4 *>(tile_scheduler_metadata0);
+            tile_scheduler_metadata_ptr[i * TileSchedulerMetaDataSize + 4] = tile_scheduler_metadata1;
+        }
+        //FLASH_DEVICE_ASSERT(now_idx == batch_size && now_block == 0 && now_n_split_idx == 0);
+    }
+    __syncthreads();
+
+    for (int i = threadIdx.x; i <= batch_size; i += 64) {
+        num_splits_ptr[i] = num_splits_shared[i];
+    }
+}
+
+void get_mla_metadata_func(Mla_metadata_params &params, cudaStream_t stream) {
+    FLASH_ASSERT(params.batch_size < MaxBatchSize);
+    get_mla_metadata_kernel<<<1, 64, 0, stream>>>(params);
+    CHECK_CUDA_KERNEL_LAUNCH();
+}
+
+
+__global__ void __launch_bounds__(64, 1)
+get_mla_decoding_metadata_kernel(const GetDecodingMetadataParams params) {
+    int *seqlens_k_ptr = params.seqlens_k_ptr;
+    int *tile_scheduler_metadata_ptr = params.tile_scheduler_metadata_ptr;
+    int *num_splits_ptr = params.num_splits_ptr;
+    int batch_size = params.batch_size;
+    int block_size_n = params.block_size_n;
+    int fixed_overhead_num_blocks = params.fixed_overhead_num_blocks;
+    int num_sm_parts = params.num_sm_parts;
+
+    extern __shared__ int shared_mem[];
+    int* num_blocks_shared = shared_mem; // [batch_size]
+    int* num_splits_shared = shared_mem + batch_size; // [batch_size+1]
+    int* seqlens_k_shared = shared_mem + batch_size*2+1; // [batch_size]
+    int* first_block_idx_shared = shared_mem + batch_size*3+1; // [batch_size]
+    int* last_block_idx_shared = shared_mem + batch_size*4+1; // [batch_size]
+
+    int total_num_blocks = 0;
+    for (int i = threadIdx.x; i < batch_size; i += 64) {
+        int cur_s_k = params.topk == -1 ? __ldg(seqlens_k_ptr + i) : params.topk;
+        seqlens_k_shared[i] = cur_s_k;
+        int first_token_idx = 0;
+        int last_token_idx = max(cur_s_k-1, 0);
+        int cur_first_block_idx = first_token_idx / block_size_n;
+        int cur_last_block_idx = last_token_idx / block_size_n;
+        // NOTE Should attend to tokens [first_token_idx, last_token_idx], i.e. blocks [cur_first_block_idx, cur_last_block_idx]
+        // NOTE Before clamping, first_token_idx <= last_token_idx always holds, so after clamping, first_token_idx <= last_token_idx still holds.
+        // NOTE if seqlens_k is 0, then first_token_idx == last_token_idx == cur_first_block_idx == cur_last_block_idx == 0. So the sequence will have 1 block. We will correct this later in this kernel.
+        int num_blocks = cur_last_block_idx - cur_first_block_idx + 1;
+        total_num_blocks += num_blocks + fixed_overhead_num_blocks;
+        num_blocks_shared[i] = num_blocks;
+        first_block_idx_shared[i] = cur_first_block_idx;
+        last_block_idx_shared[i] = cur_last_block_idx;
+    }
+    for (int offset = 32; offset >= 1; offset /= 2) {
+        // total_num_blocks += __shfl_xor_sync(uint32_t(-1), total_num_blocks, offset);
+        total_num_blocks += __shfl_xor(total_num_blocks, offset, 64);
+
+    }
+    __syncthreads();
+
+    if (threadIdx.x == 0) {
+        int payload = cutlass::ceil_div(total_num_blocks, num_sm_parts) + fixed_overhead_num_blocks;
+
+        int now_idx = 0, now_block = 0, now_n_split_idx = 0, cum_num_splits = 0;
+        num_splits_shared[0] = 0;
+        for (int i = 0; i < num_sm_parts; ++i) {
+            int tile_scheduler_metadata0[4], tile_scheduler_metadata1;
+            tile_scheduler_metadata0[0] = now_idx;
+            tile_scheduler_metadata0[1] = now_block + first_block_idx_shared[now_idx];
+            tile_scheduler_metadata1 = now_n_split_idx;
+            int remain_payload = payload;
+            while (now_idx < batch_size) {
+                int num_blocks = num_blocks_shared[now_idx];
+                int now_remain_blocks = num_blocks - now_block;
+                if (remain_payload >= now_remain_blocks + fixed_overhead_num_blocks) {
+                    cum_num_splits += now_n_split_idx + 1;
+                    num_splits_shared[now_idx + 1] = cum_num_splits;
+                    remain_payload -= now_remain_blocks + fixed_overhead_num_blocks;
+                    ++now_idx;
+                    now_block = 0;
+                    now_n_split_idx = 0;
+                } else {
+                    if (remain_payload - fixed_overhead_num_blocks > 0) {
+                        now_block += remain_payload - fixed_overhead_num_blocks;
+                        ++now_n_split_idx;
+                        remain_payload = 0;
+                    }
+                    break;
+                }
+            }
+            tile_scheduler_metadata0[2] = now_block > 0 ? now_idx : now_idx - 1;
+            tile_scheduler_metadata0[3] = now_block > 0 ? now_block + first_block_idx_shared[now_idx] : (seqlens_k_shared[now_idx-1] == 0 ? 0 : last_block_idx_shared[now_idx-1] + 1);
+            *reinterpret_cast<int4 *>(tile_scheduler_metadata_ptr + i * TileSchedulerMetaDataSize) = *reinterpret_cast<int4 *>(tile_scheduler_metadata0);
+            tile_scheduler_metadata_ptr[i * TileSchedulerMetaDataSize + 4] = tile_scheduler_metadata1;
+        }
+        // FLASH_DEVICE_ASSERT(now_idx == batch_size && now_block == 0 && now_n_split_idx == 0);
+    }
+    __syncthreads();
+
+    for (int i = threadIdx.x; i <= batch_size; i += 64) {
+        num_splits_ptr[i] = num_splits_shared[i];
+    }
+}
+
+
+void run_get_mla_metadata_kernel(GetDecodingMetadataParams &params, cudaStream_t stream) {
+    int smem_size = sizeof(int) * (params.batch_size*5+1);
+    // CHECK_CUDA(cudaFuncSetAttribute(get_mla_metadata_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    get_mla_decoding_metadata_kernel<<<1, 64, smem_size, stream>>>(params);
+    CHECK_CUDA_KERNEL_LAUNCH();
+}
\ No newline at end of file
--- a/csrc/extension/flash_mla.h
+++ b/csrc/extension/flash_mla.h
+#pragma once
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Flash_fwd_mla_params {
+    using index_t = int64_t;
+
+    int b, seqlen_q, d, d_v;
+    int h, h_h_k_ratio, ngroups;
+    bool is_causal;
+    float scale_softmax, scale_softmax_log2;
+    int *__restrict__ cu_seqlens_k;
+
+    void *__restrict__ q_ptr;
+    void *__restrict__ q_nope_ptr;
+    void *__restrict__ q_pe_ptr;
+    void *__restrict__ k_ptr;
+    void *__restrict__ v_ptr;
+    void *__restrict__ o_ptr;
+    void *__restrict__ softmax_lse_ptr;
+
+    index_t q_batch_stride;
+    index_t q_nope_batch_stride;
+    index_t q_pe_batch_stride;
+    index_t k_batch_stride;
+    index_t v_batch_stride;
+    index_t o_batch_stride;
+    index_t q_row_stride;
+    index_t q_nope_row_stride;
+    index_t q_pe_row_stride;
+    index_t k_row_stride;
+    index_t v_row_stride;
+    index_t o_row_stride;
+    index_t q_head_stride;
+    index_t q_nope_head_stride;
+    index_t q_pe_head_stride;
+    index_t k_head_stride;
+    index_t v_head_stride;
+    index_t o_head_stride;
+
+    int *__restrict__ block_table;
+    index_t block_table_batch_stride;
+    int page_block_size;
+
+    int *__restrict__ tile_scheduler_metadata_ptr;
+    int num_sm_parts;
+    int *__restrict__ num_splits_ptr;
+
+    void *__restrict__ softmax_lseaccum_ptr;
+    void *__restrict__ oaccum_ptr;
+
+    void *__restrict__ k_scale_ptr;
+    float * __restrict__ descale_q_ptr ;
+    float * __restrict__ descale_k_ptr ;
+
+};
+
+struct SparsePrefillParams {
+    int s_q, s_kv, h_q, h_kv, d_qk, d_v, topk;
+    float sm_scale, sm_scale_div_log2;
+
+    // Input tensors
+    void* __restrict__ q;    // [s_q, h_q, d_qk]
+    void* __restrict__ kv;   // [s_kv, h_kv, d_qk]
+    void* __restrict__ indices;   // [s_q, h_kv, topk]
+
+    int stride_q_s_q; 
+    int stride_q_h_q;
+    int stride_kv_s_kv; 
+    int stride_kv_h_kv;
+    int stride_indices_s_q; 
+    int stride_indices_h_kv;
+
+    // Output tensors
+    void* __restrict__ out;   // [s_q, h_q, d_v]
+    void* __restrict__ max_logits; // [s_q, h_q]
+    void* __restrict__ lse; // [s_q, h_q]
+
+    // cudaStream_t stream;
+};
+
+static constexpr int TileSchedulerMetaDataSize = 8;
+// [begin_idx, begin_seqlen, end_idx, end_seqlen, begin_n_split_idx, _, _, _]
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, int Headdim>
+void run_mha_fwd_splitkv_mla(Flash_fwd_mla_params &params, const std::string& kv_cache_dtype, hipStream_t stream, bool is_q_nope_pe = false);
+
+template<typename T, typename To, int Headdim>
+void run_mha_fwd_splitkv_mla_fp8(Flash_fwd_mla_params &params,hipStream_t stream, bool is_with_cat);
+
+template<typename T, int Headdim>
+void run_mha_fwd_sparse_prefill(const SparsePrefillParams &params,  hipStream_t stream);
+
+struct Mla_metadata_params {
+    int *__restrict__ seqlens_k_ptr;
+    int *__restrict__ tile_scheduler_metadata_ptr;
+    int *__restrict__ num_splits_ptr;
+    int batch_size;
+    int block_size_n;
+    int fixed_overhead_num_blocks;
+    int num_sm_parts;
+};
+
+void get_mla_metadata_func(Mla_metadata_params &params, cudaStream_t stream);
+
+enum class Fp8KVCacheDataType {
+  kAuto = 0,
+  kFp8E4M3 = 1,
+  kFp8E5M2 = 2,
+  kInt8 = 3,
+};
+
+static inline bool get_env_(const char *env_var) {
+  if (char *value = std::getenv(env_var)) {
+    if (strcmp(value, "0") == 0) {
+      return false;
+    }
+    return true;
+  }
+  return false;
+}
+
+struct GetDecodingMetadataParams {
+    int *__restrict__ seqlens_k_ptr;
+    int *__restrict__ tile_scheduler_metadata_ptr;
+    int *__restrict__ num_splits_ptr;
+    int batch_size;
+    int block_size_n;
+    int fixed_overhead_num_blocks;
+    int num_sm_parts;
+    int topk;
+};
+
+
+void run_get_mla_metadata_kernel(GetDecodingMetadataParams &params, cudaStream_t stream);
+
+struct DecodingParams {
+    using index_t = int64_t;
+
+    int b;              // batch size
+    int s_q;
+    int q_seq_per_hk;   // The number of q(s) per KV head, = h_q / h_k * s_q
+    int d, d_v;         // K/V dimension
+    int h_q, h_k;       // The number of Q/K heads
+    int num_blocks;     // Number of blocks in total
+    int q_head_per_hk;  // The number of q_head(s) per KV head, = h_q / h_k
+    bool is_causal;
+    float scale_softmax, scale_softmax_log2;
+    int topk;
+    
+    void *__restrict__ q_ptr;
+    void *__restrict__ k_ptr;
+    void *__restrict__ o_ptr;
+    void *__restrict__ softmax_lse_ptr;
+    int *__restrict__ indices_ptr;
+
+    index_t q_batch_stride;
+    index_t k_batch_stride;
+    index_t o_batch_stride;
+    index_t q_row_stride;
+    index_t k_row_stride;
+    index_t o_row_stride;
+    index_t q_head_stride;
+    index_t k_head_stride;
+    index_t o_head_stride;
+    index_t indices_batch_stride;
+    index_t indices_row_stride;
+
+    int *__restrict__ block_table;
+    index_t block_table_batch_stride;
+    int page_block_size;
+    int *__restrict__ seqlens_k_ptr;
+
+    int *__restrict__ tile_scheduler_metadata_ptr;
+    int num_sm_parts;
+    int *__restrict__ num_splits_ptr;
+
+    int total_num_splits;
+    void *__restrict__ softmax_lseaccum_ptr;
+    void *__restrict__ oaccum_ptr;
+};
+
+template<typename T, int Headdim>
+void run_flash_splitkv_sparse_mla_kernel(const DecodingParams &params,  cudaStream_t stream);
\ No newline at end of file
--- a/csrc/extension/softmax.h
+++ b/csrc/extension/softmax.h
--- a/csrc/extension/static_switch.h
+++ b/csrc/extension/static_switch.h
+#pragma once
+
+#define CHECK_CUDA(call)                                                                                  \
+    do {                                                                                                  \
+        cudaError_t status_ = call;                                                                       \
+        if (status_ != cudaSuccess) {                                                                     \
+            fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
+            exit(1);                                                                                      \
+        }                                                                                                 \
+    } while(0)
+
+#define CHECK_CUDA_KERNEL_LAUNCH() CHECK_CUDA(cudaGetLastError())
+
+
+#define FLASH_ASSERT(cond)                                                                                \
+    do {                                                                                                  \
+        if (not (cond)) {                                                                                 \
+            fprintf(stderr, "Assertion failed (%s:%d): %s\n", __FILE__, __LINE__, #cond);                 \
+            exit(1);                                                                                      \
+        }                                                                                                 \
+    } while(0)
+
+
+#define FLASH_DEVICE_ASSERT(cond)                                                                         \
+    do {                                                                                                  \
+        if (not (cond)) {                                                                                 \
+            printf("Assertion failed (%s:%d): %s\n", __FILE__, __LINE__, #cond);                          \
+            asm("S_ENDPGM;");                                                                                 \
+        }                                                                                                 \
+    } while(0)
+
+
+#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static bool CONST_NAME = true;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static bool CONST_NAME = false; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+
+#define MLA_NUM_SPLITS_SWITCH(NUM_SPLITS, NAME, ...) \
+  [&] {                                              \
+    if (NUM_SPLITS <= 32) {                          \
+      constexpr static int NAME = 32;                \
+      return __VA_ARGS__();                          \
+    } else if (NUM_SPLITS <= 64) {                   \
+      constexpr static int NAME = 64;                \
+      return __VA_ARGS__();                          \
+    } else if (NUM_SPLITS <= 72) {                   \
+      constexpr static int NAME = 72;                \
+      return __VA_ARGS__();                          \
+    } else if (NUM_SPLITS <= 96) {                   \
+      constexpr static int NAME = 96;                \
+      return __VA_ARGS__();                          \
+    } else if (NUM_SPLITS <= 128) {                  \
+      constexpr static int NAME = 128;               \
+      return __VA_ARGS__();                          \
+    } else if (NUM_SPLITS <= 144) {                  \
+      constexpr static int NAME = 144;               \
+      return __VA_ARGS__();                          \
+    } else if (NUM_SPLITS <= 160) {                  \
+      constexpr static int NAME = 160;               \
+      return __VA_ARGS__();                          \
+    } else {                                         \
+      FLASH_ASSERT(false);                           \
+    }                                                \
+  }()
+
--- a/csrc/extension/utils.h
+++ b/csrc/extension/utils.h
--- a/csrc/gfx93/decode/dense_kvfp8/splitkv_mla.cuh
+++ b/csrc/gfx93/decode/dense_kvfp8/splitkv_mla.cuh
@@ -17,6 +17,7 @@ compute_attn_1rowblock_splitkv_mla_kvfp8(const DenseAttnDecodeParams_fp8 &params
                                        const int n_split_idx, const int seqlen_k,
                                        const int n_block_min, const int n_block_max, const bool NoSplit)
 {
+#if 0
    constexpr static bool Is_causal = T::Is_causal;
    constexpr int kBlockM = T::kBlockM;
    constexpr int kBlockN = T::kBlockN;
@@ -384,9 +385,7 @@ compute_attn_1rowblock_splitkv_mla_kvfp8(const DenseAttnDecodeParams_fp8 &params
            }
        }
    }
-
-
-
+#endif
 }

 template<typename T>

--- a/csrc/params.h
+++ b/csrc/params.h
@@ -184,9 +184,9 @@ inline constexpr bool is_decode_v = std::bool_constant<FWD_MODE == SparseAttnFwd
 template<SparseAttnFwdMode FWD_MODE>
 using SparseFwdArgT = std::conditional_t<is_decode_v<FWD_MODE>, SparseAttnDecodeParams, SparseAttnFwdParams>;

-enum class Fp8KVCacheDataType {
-  kAuto = 0,
-  kFp8E4M3 = 1,
-  kFp8E5M2 = 2,
-  kInt8 = 3,
-};
+// enum class Fp8KVCacheDataType {
+//   kAuto = 0,
+//   kFp8E4M3 = 1,
+//   kFp8E5M2 = 2,
+//   kInt8 = 3,
+// };
--- a/csrc/utils.h
+++ b/csrc/utils.h
@@ -1081,7 +1081,7 @@ void wait_vmcnt() {
                "s_barrier; \n\t"                
                :: "n"(N));
 }
-
+#if 0
 template<typename Element, bool is_scale_equal_one, Fp8KVCacheDataType KV_DTYPE, typename Tensor0, typename Tensor1, typename Tensor2, typename Tensor3, typename Tensor4, 
         typename TiledMma, typename TiledCopy, typename ThrCopy>
 __forceinline__ __device__ void gemm_rs_fp8(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB_int8, Tensor3 &tCrB, Tensor4 const& tCsB,
@@ -1302,7 +1302,7 @@ __forceinline__ __device__ void gemm_k_rs_fp8(Tensor0 &acc, Tensor1 &tCrA,  Tens
    }
    cute::gemm(tiled_mma, tCrA(_, _, k_idx), tCrB(_, _, k_idx), acc);
 }
-
+#endif
 template <
        bool Is_even_MN=true, 
        bool Is_even_K=true, 
@@ -1367,7 +1367,7 @@ buffer_load_copy_fp8(

    }
 }
-
+#if 0
 template<typename Element, bool is_scale_equal_one, Fp8KVCacheDataType KV_DTYPE, typename Tensor0, typename Tensor1, typename Tensor3, typename Tensor4, 
         typename TiledMma, typename TiledCopy, typename ThrCopy>
 __forceinline__ __device__ void gemm1_rs_fp8(Tensor0 &acc, Tensor1 &tCrA, Tensor3 &tCrB, Tensor4 const& tCsB,
@@ -1500,8 +1500,7 @@ __forceinline__ __device__ void gemm1_rs_fp8(Tensor0 &acc, Tensor1 &tCrA, Tensor
    }

 }
-
-
+#endif


 }
\ No newline at end of file
--- a/flash_mla/__init__.py
+++ b/flash_mla/__init__.py
@@ -4,14 +4,27 @@ from flash_mla.flash_mla_interface import (
    get_mla_metadata,
    flash_mla_with_kvcache,
    flash_mla_sparse_fwd,
-    flash_mla_with_kvcache_qkvfp8,
-    flash_mla_with_kvcache_kvfp8
+    get_mla_decoding_metadata_dense_fp8,
+    flash_mla_with_kvcache_quantization,
+    flash_mla_with_kvcache_q_nope_pe,
+    flash_mla_with_kvcache_quantization_q_nope_pe,
+    flash_mla_with_kvcache_fp8,
+    flash_mla_with_kvcache_fp8_with_cat
 )

 __all__ = [
    "get_mla_metadata",
    "flash_mla_with_kvcache",
    "flash_mla_sparse_fwd",
-    "flash_mla_with_kvcache_qkvfp8",
-    "flash_mla_with_kvcache_kvfp8"
+    "get_mla_decoding_metadata_dense_fp8",
+    "flash_mla_with_kvcache_quantization",
+    "flash_mla_with_kvcache_q_nope_pe",
+    "flash_mla_with_kvcache_quantization_q_nope_pe",
+    "flash_mla_with_kvcache_fp8",
+    "flash_mla_with_kvcache_fp8_with_cat"
 ]
+
+import os
+FLASH_MLA_ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+# print(FLUX_ROOT_DIR)
+os.environ["FLASH_MLA_ROOT_DIR"] = FLASH_MLA_ROOT_DIR + "/asm/"
\ No newline at end of file
--- a/flash_mla/asm/build.sh
+++ b/flash_mla/asm/build.sh
+clang -x assembler -target amdgcn-amd-amdhsa -mcode-object-version=4 -mcpu=gfx938:sramecc+ -c -o flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.o flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.s 
+clang -target amdgcn-amd-amdhsa  flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.o -o flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.co
--- a/flash_mla/asm/flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.co
+++ b/flash_mla/asm/flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.co
--- a/flash_mla/asm/flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.s
+++ b/flash_mla/asm/flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.s