Commit 518a5f4d authored by hly's avatar hly
Browse files

import aicc-master-dev

parent c2a1b310
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -60,11 +60,6 @@ struct Flash_fwd_kernel_traits : public Base { ...@@ -60,11 +60,6 @@ struct Flash_fwd_kernel_traits : public Base {
static constexpr size_t k_smem_size = (STAGES * (kWaveN / 32) * (kBlockK / 32) * (32 * 34)) * sizeof(Element); static constexpr size_t k_smem_size = (STAGES * (kWaveN / 32) * (kBlockK / 32) * (32 * 34)) * sizeof(Element);
static constexpr size_t v_smem_size = (STAGES * kBlockK * 32/*WARP_K*/) * sizeof(Element); static constexpr size_t v_smem_size = (STAGES * kBlockK * 32/*WARP_K*/) * sizeof(Element);
#if (TARGET == 928)
static constexpr int kSmemSize = std::max(q_smem_size, v_smem_size) + k_smem_size * 2;
#else
static constexpr int kSmemSize = std::max(std::max(q_smem_size, v_smem_size), k_smem_size * 2);
#endif
}; };
// Is_V_in_regs is an option to reduce smem usage, but will increase register pressure. // Is_V_in_regs is an option to reduce smem usage, but will increase register pressure.
......
This diff is collapsed.
...@@ -69,7 +69,7 @@ __forceinline__ __device__ void int8_kvcache_qk_gemm_prefetch_v_3stage( ...@@ -69,7 +69,7 @@ __forceinline__ __device__ void int8_kvcache_qk_gemm_prefetch_v_3stage(
auto BUFFER_LOAD_FUNC = &inline_buffer_load_dword_lds<Element_q, 2>; auto BUFFER_LOAD_FUNC = &inline_buffer_load_dword_lds<Element_q, 2>;
// load 指令发下去之后, 先做一些初始化运算 // load 指令发下去之后, 先做一些初始化运算
#if defined(__gfx936__) || defined(__gfx938__) #if defined(__gfx936__) || defined(__gfx938__) || defined(__gfx946__)
if constexpr (M_MMAC_COUNT == 1) { if constexpr (M_MMAC_COUNT == 1) {
inline_vgpr4_init_zero_1x2x4(s_reg); inline_vgpr4_init_zero_1x2x4(s_reg);
} else { } else {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment