Unverified Commit 4987f13d authored by Gabe Goodhart, committed by GitHub

Llama cpp bump (df1b612): granite docling / mamba2 optimizations / multimodal encoding fixes (#12552)

* feat: Bump llama.cpp to df1b612

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Correctly encode text chunks during mtmd tokenization

For some models, text chunks containing template delimiter tokens can appear
interspersed with the image embeddings. These need to be correctly translated
to text tokens (a sketch of the chunk handling follows below).

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
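
For illustration, a minimal sketch of the chunk handling described above, written in C++ against the llama.cpp mtmd C API. The function names (mtmd_input_chunks_size, mtmd_input_chunks_get, mtmd_input_chunk_get_type, mtmd_input_chunk_get_tokens_text) are taken from the vendored mtmd header as of this sync and should be verified against the tree; this is not the actual Ollama code path.

```cpp
// Sketch only: walk the chunks produced by mtmd tokenization and forward text
// tokens (including template delimiters between image slots) to the text
// decode path instead of dropping them. Image chunks go to the embedding path.
#include <cstddef>
#include <vector>

#include "mtmd.h" // assumption: vendored llama.cpp mtmd C header, pulls in llama_token

static std::vector<llama_token> collect_text_tokens(const mtmd_input_chunks * chunks) {
    std::vector<llama_token> text_tokens;
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); ++i) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            size_t n_tokens = 0;
            const llama_token * toks = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
            text_tokens.insert(text_tokens.end(), toks, toks + n_tokens);
        }
        // image (and audio) chunks are encoded separately and spliced in as embeddings
    }
    return text_tokens;
}
```
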

* tests: Use MtmdChunk in image_test

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Fix unnecessary conversion linting

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(ggml): Revert changes to ggml_hip.cpp

These changes were done largely by our code assistant and are likely wrong

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Revert changes in mem_nvml.cpp

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update sync point to 1deee0

This brings in several more optimization commits and model support for
EmbeddingGemma

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches for 1deee0

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: sync for bump to 1deee0

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Bad patch updates with errant `+`

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp/ggml to 7049736

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: format-patches after latest bump

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
parent e638f2ac
@@ -44,6 +44,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
 void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);

 void ggml_vec_silu_f32(const int n, float * y, const float * x);
+ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
 ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
 ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
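
The new ggml_vec_cvar_f32 prototype above lands without its implementation in this excerpt. As a reading aid only, here is a scalar sketch of the semantics suggested by the name and comment (center y around mean, return a variance-style accumulator); the exact normalization and the real SIMD kernel are assumptions, not the vendored code.

```cpp
#include <cstddef>

typedef double ggml_float; // assumption: matches ggml's wide accumulator type

// Scalar reference with assumed semantics: y[i] = x[i] - mean, return the
// mean of squared deviations (the real kernel may return the raw sum instead).
static ggml_float vec_cvar_f32_ref(const int n, float * y, const float * x, const float mean) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        const float centered = x[i] - mean;
        y[i] = centered;                              // center y in place
        sum += (ggml_float) centered * centered;      // accumulate squared deviation
    }
    return n > 0 ? sum / n : 0.0;
}
```
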
@@ -143,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
         for (int i = 0; i < np; i += ggml_f16_step) {
             ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements

-            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
+            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
             sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
             ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
             sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

             ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements

-            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
+            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
             sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
             ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
             sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -159,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
             sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+            ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
             sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

             ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
@@ -654,11 +655,11 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
     }
     // leftovers
     // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
-    if (np < n) {
-        svbool_t pg = svwhilelt_b32(np, n);
-        ay1 = svld1_f32(pg, y + np);
+    for (int i = np; i < n; i += ggml_f32_epr) {
+        svbool_t pg = svwhilelt_b32(i, n);
+        ay1 = svld1_f32(pg, y + i);
         ay1 = svmul_f32_m(pg, ay1, vx);
-        svst1_f32(pg, y + np, ay1);
+        svst1_f32(pg, y + i, ay1);
     }
 #elif defined(__riscv_v_intrinsic)
     for (int i = 0, avl; i < n; i += avl) {
@@ -819,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
+        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
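
The fp16 change above makes ggml_vec_elu_f16 follow the same definition as the fp32 variant on the line above it: identity for positive inputs, expm1 otherwise (previously expm1f was applied unconditionally). A standalone scalar reference for that definition, independent of the GGML fp16 conversion macros:

```cpp
#include <cmath>
#include <cstdio>

// ELU with alpha = 1, as in ggml_vec_elu_f32: x for x > 0, expm1(x) otherwise.
static float elu_ref(float x) {
    return x > 0.0f ? x : std::expm1(x);
}

int main() {
    const float xs[] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
    for (float x : xs) {
        std::printf("elu(% .1f) = % .6f\n", x, elu_ref(x));
    }
    return 0;
}
```
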
......
@@ -44,6 +44,8 @@ if (CUDAToolkit_FOUND)
     list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")

     file(GLOB GGML_SOURCES_CUDA "*.cu")
+    file(GLOB SRCS "template-instances/fattn-tile*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
     file(GLOB SRCS "template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
     file(GLOB SRCS "template-instances/mmq*.cu")
......
@@ -245,14 +245,6 @@ static const char * cu_get_error_str(CUresult err) {
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

-#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
-#define FP16_MMA_AVAILABLE
-#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
-
-#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
-#define FP16_MMA_AVAILABLE
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
-
 #if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
@@ -278,7 +270,8 @@ static bool fp16_available(const int cc) {
 }

 static bool fast_fp16_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
+    return GGML_CUDA_CC_IS_AMD(cc) ||
+        (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && ggml_cuda_highest_compiled_arch(cc) != 610);
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -287,27 +280,6 @@ static bool fast_fp16_hardware_available(const int cc) {
         (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }

-// Any FP16 tensor core instructions are available for ggml code.
-static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
-    return false;
-#else
-    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
-        GGML_CUDA_CC_IS_MTHREADS(cc)) {
-        return true;
-    } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
-#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
-        return true;
-#else
-        return false;
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
-    } else {
-        return false;
-    }
-#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
-}
-
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
@@ -625,6 +597,10 @@ static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v,
 }

 // Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
+// Important: do not use this function if dst and src both point at registers.
+// Due to the strict aliasing rule the compiler can do incorrect optimizations if src and dst have different types.
+// The function is intended for copies between registers and SRAM/VRAM to make the compiler emit the right instructions.
+// If dst and src point at different address spaces then they are guaranteed to not be aliased.
 template <int nbytes, int alignment = 0>
 static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
     if constexpr (alignment != 0) {
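
To illustrate the usage guidance in the comment block added above, here is a hypothetical device-side use of ggml_cuda_memcpy_1 for a VRAM-to-register copy. The kernel, its name, and the buffer layout are invented for illustration and assume the common.cuh shown in this hunk (and CUDA's half type) are in scope; the register-to-register case with mismatched types is exactly what the new comment warns against.

```cuda
// Hypothetical illustration only, not vendored code: scale 8 halves per thread,
// moving them between VRAM and per-thread registers with single 16-byte transfers.
__global__ void scale_f16x8(half * __restrict__ dst, const half * __restrict__ src,
                            const float v, const int n) {
    const int i = 8 * (blockIdx.x * blockDim.x + threadIdx.x);
    if (i + 8 > n) {
        return; // simplistic bounds check; real kernels handle the tail explicitly
    }

    half buf[8];                            // lives in registers
    ggml_cuda_memcpy_1<16>(buf, src + i);   // VRAM -> registers, one 16-byte transfer

#pragma unroll
    for (int k = 0; k < 8; ++k) {
        buf[k] = __float2half(v * __half2float(buf[k]));
    }

    ggml_cuda_memcpy_1<16>(dst + i, buf);   // registers -> VRAM
}
```
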
......
@@ -793,8 +793,6 @@ void launch_fattn(
     GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
         "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");

-    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
-
     ggml_cuda_pool & pool = ctx.pool();
     cudaStream_t main_stream = ctx.stream();
     const int id = ggml_cuda_get_device();
@@ -878,7 +876,7 @@ void launch_fattn(
     // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
     // Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
     // multiple sequences of possibly different lengths.
-    if (mask && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
+    if (mask && K->ne[1] % FATTN_KQ_STRIDE == 0 && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
         const int s31 = mask->nb[1] / sizeof(half2);
         const int s33 = mask->nb[3] / sizeof(half2);
@@ -916,8 +914,7 @@ void launch_fattn(
         dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
     } else {
-        GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
-        const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
+        const int ntiles_KQ = (K->ne[1] + KQ_row_granularity - 1) / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.

         // parallel_blocks must not be larger than what the tensor size allows:
         parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
@@ -946,7 +943,7 @@ void launch_fattn(
     blocks_num.x = ntiles_x;
     blocks_num.y = parallel_blocks;
-    blocks_num.z = Q->ne[2]*Q->ne[3];
+    blocks_num.z = (Q->ne[2]/ncols2)*Q->ne[3];

     if (parallel_blocks > 1) {
         dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
......
@@ -535,8 +535,6 @@ void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_ten
     float logit_softcap;
     memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-
     if (Q->ne[1] == 1) {
         constexpr int cols_per_block = 1;
         if (logit_softcap == 0.0f) {
......
@@ -6,19 +6,19 @@
 #include "fattn-common.cuh"
 #include "fattn-wmma-f16.cuh"

-#ifdef FP16_MMA_AVAILABLE
+#ifdef GGML_USE_WMMA_FATTN
 #if !defined(GGML_USE_HIP)
 #include <mma.h>
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA)
 namespace wmma = mtmusa::wmma;
 #else // GGML_USE_MUSA
 namespace wmma = nvcuda::wmma;
 #endif // GGML_USE_MUSA
-#elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
+#elif defined(GGML_USE_HIP)
 #include <rocwmma/rocwmma.hpp>
 namespace wmma = rocwmma;
 #endif // !defined(GGML_USE_HIP)
-#endif // FP16_MMA_AVAILABLE
+#endif // GGML_USE_WMMA_FATTN

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, typename KQ_acc_t, bool use_logit_softcap>
@@ -45,7 +45,7 @@ static __global__ void flash_attn_ext_f16(
     const int32_t nb21, const int32_t nb22, const int64_t nb23,
     const int32_t ne31, const int32_t ne32, const int32_t ne33,
     const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
+#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
@@ -481,7 +481,7 @@ static __global__ void flash_attn_ext_f16(
                 ne31, ne32, ne33,
                 nb31, nb32, nb33);
     NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
+#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
 }

 constexpr int get_max_power_of_2(int x) {
......
+#pragma once
+
 #include "common.cuh"

+#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
+#define GGML_USE_WMMA_FATTN
+#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
+
+#if defined(GGML_HIP_ROCWMMA_FATTN)
+#if defined(CDNA) && (ROCWMMA_VERSION_MAJOR < 2 || ROCWMMA_VERSION_MINOR > 0 || ROCWMMA_VERSION_PATCH > 0)
+#define GGML_USE_WMMA_FATTN
+#elif defined(CDNA)
+#warning "rocwmma fattn on CDNA is broken on rocwmma v2.0.0, expect degraded performance"
+#endif // defined(CDNA) && (ROCWMMA_VERSION_MAJOR < 2 || ROCWMMA_VERSION_MINOR > 0 || ROCWMMA_VERSION_PATCH > 0)
+
+#if defined(RDNA3)
+#define GGML_USE_WMMA_FATTN
+#endif // defined(RDNA3)
+
+#if defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
+#define GGML_USE_WMMA_FATTN
+#elif defined(RDNA4)
+#warning "rocwmma fattn is not suported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
+#endif // defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
+#endif // defined(GGML_HIP_ROCWMMA_FATTN)
+
+// WMMA flash attention requires FP16 matrix instructions to be available for ggml code.
+static bool ggml_cuda_should_use_wmma_fattn(const int cc) {
+#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
+    return false;
+#else
+    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_MTHREADS(cc)) {
+        return true;
+    } else if (GGML_CUDA_CC_IS_CDNA(cc)){
+#if defined(GGML_HIP_ROCWMMA_FATTN) && (ROCWMMA_VERSION_MAJOR < 2 || ROCWMMA_VERSION_MINOR > 0 || ROCWMMA_VERSION_PATCH > 0)
+        return true;
+#else
+        return false;
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) (ROCWMMA_VERSION_MAJOR < 2 || ROCWMMA_VERSION_MINOR > 0 || ROCWMMA_VERSION_PATCH > 0)
+    } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+#if defined(GGML_HIP_ROCWMMA_FATTN) && ROCWMMA_VERSION_MAJOR > 1
+        return true;
+#else
+        return false;
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && ROCWMMA_VERSION_MAJOR > 1
+    } else {
+        return false;
+    }
+#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
+}
+
 void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -198,6 +198,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     return BEST_FATTN_KERNEL_NONE;
 #endif// FLASH_ATTN_AVAILABLE

+    const ggml_tensor * KQV = dst;
     const ggml_tensor * Q = dst->src[0];
     const ggml_tensor * K = dst->src[1];
     const ggml_tensor * V = dst->src[2];
@@ -206,31 +207,32 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     const int gqa_ratio = Q->ne[2] / K->ne[2];
     GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);

+    float max_bias = 0.0f;
+    memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
+
+    // The effective batch size for the kernel can be increased by gqa_ratio.
+    // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded,
+    const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
+
     const int cc = ggml_cuda_info().devices[device].cc;

     switch (K->ne[0]) {
+        case 40:
         case 64:
+        case 128:
+        case 256:
+            if (V->ne[0] != K->ne[0]) {
+                return BEST_FATTN_KERNEL_NONE;
+            }
+            break;
         case 80:
         case 96:
-        case 128:
         case 112:
-        case 256:
             if (V->ne[0] != K->ne[0]) {
                 return BEST_FATTN_KERNEL_NONE;
             }
-            if (!fp16_mma_available(cc) && !turing_mma_available(cc)) {
-                return BEST_FATTN_KERNEL_NONE;
-            }
             break;
         case 576:
             if (V->ne[0] != 512) {
                 return BEST_FATTN_KERNEL_NONE;
             }
-            if (!turing_mma_available(cc) || gqa_ratio % 16 != 0) {
+            if (!gqa_opt_applies || gqa_ratio % 16 != 0) {
                 return BEST_FATTN_KERNEL_NONE;
             }
             break;
@@ -264,47 +266,57 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         return BEST_FATTN_KERNEL_NONE;
     }

-    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0;
+    // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
+    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;

-    // If Turing tensor cores available, use them except for some cases with batch size 1:
-    if (turing_mma_available(cc)) {
-        best_fattn_kernel best = BEST_FATTN_KERNEL_MMA_F16;
+    // If Turing tensor cores available, use them:
+    if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40) {
         if (can_use_vector_kernel) {
             if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
                 if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
-                    best = BEST_FATTN_KERNEL_VEC;
+                    return BEST_FATTN_KERNEL_VEC;
                 }
             } else {
                 if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
                     if (Q->ne[1] <= 2) {
-                        best = BEST_FATTN_KERNEL_VEC;
+                        return BEST_FATTN_KERNEL_VEC;
                     }
                 } else {
                     if (Q->ne[1] == 1) {
-                        best = BEST_FATTN_KERNEL_VEC;
+                        return BEST_FATTN_KERNEL_VEC;
                     }
                 }
             }
-            if ((gqa_ratio % 2 != 0 || !mask) && Q->ne[1] == 1) {
-                best = BEST_FATTN_KERNEL_VEC; // GQA-specific optimizations in the mma kernel do not apply.
+            if (!gqa_opt_applies && Q->ne[1] == 1) {
+                return BEST_FATTN_KERNEL_VEC;
             }
         }
-        return best;
+        return BEST_FATTN_KERNEL_MMA_F16;
     }

-    // Use kernels specialized for small batch sizes if possible:
-    if (Q->ne[1] <= 8 && can_use_vector_kernel) {
-        return BEST_FATTN_KERNEL_VEC;
-    }
-
-    // For large batch sizes, use the WMMA kernel if possible:
-    if (fp16_mma_available(cc)) {
+    // Use the WMMA kernel if possible:
+    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 576) {
+        if (can_use_vector_kernel && Q->ne[1] <= 2) {
+            return BEST_FATTN_KERNEL_VEC;
+        }
         return BEST_FATTN_KERNEL_WMMA_F16;
     }

-    // If there is no suitable kernel for tensor cores or small batch sizes, use the generic kernel for large batch sizes:
+    // If there are no tensor cores available, use the generic tile kernel:
+    if (can_use_vector_kernel) {
+        if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
+            if (Q->ne[1] == 1) {
+                if (!gqa_opt_applies) {
+                    return BEST_FATTN_KERNEL_VEC;
+                }
+            }
+        } else {
+            if (Q->ne[1] <= 2) {
+                return BEST_FATTN_KERNEL_VEC;
+            }
+        }
+    }
     return BEST_FATTN_KERNEL_TILE;
 }
......
@@ -291,7 +291,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;

-        info.devices[id].integrated = prop.integrated;
+        info.devices[id].integrated = false; // Temporarily disabled due to issues with corrupted output (e.g. #15034)
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
@@ -2466,6 +2466,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 case GGML_UNARY_OP_ELU:
                     ggml_cuda_op_elu(ctx, dst);
                     break;
+                case GGML_UNARY_OP_XIELU:
+                    ggml_cuda_op_xielu(ctx, dst);
+                    break;
                 default:
                     return false;
             }
......
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(112, 112);

// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(128, 128);

// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(256, 256);

// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(40, 40);

// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(576, 512);

// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(64, 64);

// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(80, 80);

// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(96, 96);
@@ -13,7 +13,7 @@
     It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
 */

-template <size_t n_experts, bool with_norm>
+template <int n_experts, bool with_norm>
 __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
                                                                   float * weights,
                                                                   int32_t * ids,
@@ -204,8 +204,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
     GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);

-    cudaStream_t stream = ctx.stream();
-
     const int n_expert_used = weights->ne[1];

     if (with_norm) {
......