Unverified Commit 527cc978 authored by Jeffrey Morgan's avatar Jeffrey Morgan Committed by GitHub
Browse files

llama: update vendored code to commit 40c6d79f (#7875)

parent a37f4a86
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -543,9 +543,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) { ...@@ -543,9 +543,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
} }
template<int D, int parallel_blocks> // D == head size template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results( static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts, const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta, const float2 * __restrict__ VKQ_meta,
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -31,9 +31,9 @@ ...@@ -31,9 +31,9 @@
#define FATTN_KQ_STRIDE_TILE_F16 64 #define FATTN_KQ_STRIDE_TILE_F16 64
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f16( static __global__ void flash_attn_tile_ext_f16(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,
...@@ -285,7 +285,7 @@ static __global__ void flash_attn_tile_ext_f16( ...@@ -285,7 +285,7 @@ static __global__ void flash_attn_tile_ext_f16(
} }
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]); half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
kqsum_j = warp_reduce_sum(kqsum_j); kqsum_j = warp_reduce_sum((float)kqsum_j);
#pragma unroll #pragma unroll
for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) { for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -31,9 +31,9 @@ ...@@ -31,9 +31,9 @@
#define FATTN_KQ_STRIDE_TILE_F32 32 #define FATTN_KQ_STRIDE_TILE_F32 32
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f32( static __global__ void flash_attn_tile_ext_f32(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -28,9 +28,9 @@ ...@@ -28,9 +28,9 @@
#include "fattn-common.cuh" #include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16( static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,
...@@ -222,7 +222,7 @@ static __global__ void flash_attn_vec_ext_f16( ...@@ -222,7 +222,7 @@ static __global__ void flash_attn_vec_ext_f16(
#pragma unroll #pragma unroll
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]); half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum(sum); sum = warp_reduce_sum((float)sum);
if (use_logit_softcap) { if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum); sum = logit_softcap*tanhf(sum);
...@@ -246,7 +246,6 @@ static __global__ void flash_attn_vec_ext_f16( ...@@ -246,7 +246,6 @@ static __global__ void flash_attn_vec_ext_f16(
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j]; half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j; kqmax_shared[j][threadIdx.y] = kqmax_new_j;
} }
...@@ -291,7 +290,7 @@ static __global__ void flash_attn_vec_ext_f16( ...@@ -291,7 +290,7 @@ static __global__ void flash_attn_vec_ext_f16(
#pragma unroll #pragma unroll
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]); kqsum[j] = warp_reduce_sum((float)kqsum[j]);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j]; kqsum_shared[j][threadIdx.y] = kqsum[j];
} }
...@@ -306,7 +305,7 @@ static __global__ void flash_attn_vec_ext_f16( ...@@ -306,7 +305,7 @@ static __global__ void flash_attn_vec_ext_f16(
} }
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ])); half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (parallel_blocks == 1) { if (parallel_blocks == 1) {
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -28,9 +28,9 @@ ...@@ -28,9 +28,9 @@
#include "fattn-common.cuh" #include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1) __launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f32( static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,
...@@ -232,7 +232,6 @@ static __global__ void flash_attn_vec_ext_f32( ...@@ -232,7 +232,6 @@ static __global__ void flash_attn_vec_ext_f32(
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_new_arr[j]; float kqmax_new_j = kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j; kqmax_shared[j][threadIdx.y] = kqmax_new_j;
} }
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -33,9 +33,9 @@ ...@@ -33,9 +33,9 @@
// D == head size, VKQ_stride == num VKQ rows calculated in parallel: // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap> template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1) __launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_ext_f16( static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q, const char * __restrict__ Q,
const char * __restrict__ K, const char * __restrict__ K,
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -39,9 +39,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ...@@ -39,9 +39,9 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
const ggml_tensor * KQV = dst; const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0]; const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[3]; const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
if (precision != GGML_PREC_DEFAULT) { if (prec != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) { if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
constexpr int cols_per_block = 16; constexpr int cols_per_block = 16;
switch (Q->ne[0]) { switch (Q->ne[0]) {
...@@ -327,11 +327,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst ...@@ -327,11 +327,11 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_set_device(ctx.device); ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int32_t precision = KQV->op_params[3]; const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
// On AMD the tile kernels perform poorly, use the vec kernel instead: // On AMD the tile kernels perform poorly, use the vec kernel instead:
if (cc >= CC_OFFSET_AMD) { if (cc >= CC_OFFSET_AMD) {
if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
} else { } else {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst); ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
...@@ -358,7 +358,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst ...@@ -358,7 +358,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
} }
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) { if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
if (precision == GGML_PREC_DEFAULT) { if (prec == GGML_PREC_DEFAULT) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
return; return;
} else if(Q->ne[0] <= 128) { } else if(Q->ne[0] <= 128) {
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -31,20 +31,22 @@ ...@@ -31,20 +31,22 @@
#include "ggml-cuda/common.cuh" #include "ggml-cuda/common.cuh"
#include "ggml-cuda/acc.cuh" #include "ggml-cuda/acc.cuh"
#include "ggml-cuda/arange.cuh" #include "ggml-cuda/arange.cuh"
#include "ggml-cuda/argmax.cuh"
#include "ggml-cuda/argsort.cuh" #include "ggml-cuda/argsort.cuh"
#include "ggml-cuda/binbcast.cuh" #include "ggml-cuda/binbcast.cuh"
#include "ggml-cuda/clamp.cuh" #include "ggml-cuda/clamp.cuh"
#include "ggml-cuda/concat.cuh" #include "ggml-cuda/concat.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh" #include "ggml-cuda/conv-transpose-1d.cuh"
#include "ggml-cuda/convert.cuh" #include "ggml-cuda/convert.cuh"
#include "ggml-cuda/count-equal.cuh"
#include "ggml-cuda/cpy.cuh" #include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/cross-entropy-loss.cuh" #include "ggml-cuda/cross-entropy-loss.cuh"
#include "ggml-cuda/diagmask.cuh" #include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/dmmv.cuh"
#include "ggml-cuda/fattn.cuh" #include "ggml-cuda/fattn.cuh"
#include "ggml-cuda/getrows.cuh" #include "ggml-cuda/getrows.cuh"
#include "ggml-cuda/im2col.cuh" #include "ggml-cuda/im2col.cuh"
#include "ggml-cuda/mmq.cuh" #include "ggml-cuda/mmq.cuh"
#include "ggml-cuda/mmv.cuh"
#include "ggml-cuda/mmvq.cuh" #include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/norm.cuh" #include "ggml-cuda/norm.cuh"
#include "ggml-cuda/opt-step-adamw.cuh" #include "ggml-cuda/opt-step-adamw.cuh"
...@@ -60,7 +62,7 @@ ...@@ -60,7 +62,7 @@
#include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh" #include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh" #include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/rwkv-wkv.cuh" #include "ggml-cuda/wkv6.cuh"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
...@@ -82,54 +84,16 @@ ...@@ -82,54 +84,16 @@
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
GGML_UNUSED(level);
GGML_UNUSED(user_data);
fprintf(stderr, "%s", msg);
}
ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
void * ggml_cuda_log_user_data = NULL;
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
ggml_cuda_log_callback = log_callback;
ggml_cuda_log_user_data = user_data;
}
#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
GGML_ATTRIBUTE_FORMAT(2, 3)
static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
if (ggml_cuda_log_callback != NULL) {
va_list args;
va_start(args, format);
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
} else {
std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
va_end(args);
va_start(args, format);
vsnprintf(&buffer2[0], buffer2.size(), format, args);
ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
}
va_end(args);
}
}
[[noreturn]] [[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails int id = -1; // in case cudaGetDevice fails
cudaGetDevice(&id); cudaGetDevice(&id);
GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg); GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_CUDA_LOG_ERROR(" %s\n", stmt); GGML_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace // abort with GGML_ABORT to get a stack trace
GGML_ABORT("CUDA error"); GGML_ABORT(GGML_CUDA_NAME " error");
} }
// this is faster on Windows // this is faster on Windows
...@@ -153,7 +117,7 @@ int ggml_cuda_get_device() { ...@@ -153,7 +117,7 @@ int ggml_cuda_get_device() {
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device); ggml_cuda_set_device(device);
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA) #if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
auto res = hipMallocManaged(ptr, size); auto res = hipMallocManaged(ptr, size);
if (res == hipSuccess) { if (res == hipSuccess) {
// if error we "need" to know why... // if error we "need" to know why...
...@@ -162,7 +126,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) ...@@ -162,7 +126,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
return res; return res;
#else #else
#if !defined(GGML_USE_HIPBLAS) #if !defined(GGML_USE_HIP)
cudaError_t err; cudaError_t err;
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
{ {
...@@ -175,7 +139,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) ...@@ -175,7 +139,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
return err; return err;
#else #else
return cudaMalloc(ptr, size); return cudaMalloc(ptr, size);
#endif // !defined(GGML_USE_HIPBLAS) #endif // !defined(GGML_USE_HIP)
#endif #endif
} }
...@@ -192,7 +156,7 @@ static ggml_cuda_device_info ggml_cuda_init() { ...@@ -192,7 +156,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
cudaError_t err = cudaGetDeviceCount(&info.device_count); cudaError_t err = cudaGetDeviceCount(&info.device_count);
if (err != cudaSuccess) { if (err != cudaSuccess) {
GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err)); GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
return info; return info;
} }
...@@ -200,20 +164,20 @@ static ggml_cuda_device_info ggml_cuda_init() { ...@@ -200,20 +164,20 @@ static ggml_cuda_device_info ggml_cuda_init() {
int64_t total_vram = 0; int64_t total_vram = 0;
#ifdef GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_MMQ
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else #else
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif // GGML_CUDA_FORCE_MMQ #endif // GGML_CUDA_FORCE_MMQ
#ifdef GGML_CUDA_FORCE_CUBLAS #ifdef GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__); GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
#else #else
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__); GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
#endif // GGML_CUDA_FORCE_CUBLAS #endif // GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
CUdevice device; CUdevice device;
CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
...@@ -225,25 +189,25 @@ static ggml_cuda_device_info ggml_cuda_init() { ...@@ -225,25 +189,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id; alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
} }
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
info.devices[id].vmm = !!device_vmm; info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
GGML_CUDA_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
info.default_tensor_split[id] = total_vram; info.default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem; total_vram += prop.totalGlobalMem;
info.devices[id].nsm = prop.multiProcessorCount; info.devices[id].nsm = prop.multiProcessorCount;
info.devices[id].smpb = prop.sharedMemPerBlock; info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock; info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else #else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
...@@ -338,7 +302,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { ...@@ -338,7 +302,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
*actual_size = look_ahead_size; *actual_size = look_ahead_size;
pool_size += look_ahead_size; pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC #ifdef DEBUG_CUDA_MALLOC
GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
(uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024)); (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
#endif #endif
return ptr; return ptr;
...@@ -353,7 +317,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { ...@@ -353,7 +317,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
return; return;
} }
} }
GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
ggml_cuda_set_device(device); ggml_cuda_set_device(device);
CUDA_CHECK(cudaFree(ptr)); CUDA_CHECK(cudaFree(ptr));
pool_size -= size; pool_size -= size;
...@@ -361,7 +325,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { ...@@ -361,7 +325,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}; };
// pool with virtual memory // pool with virtual memory
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool { struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
...@@ -455,14 +419,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { ...@@ -455,14 +419,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
} }
}; };
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) { std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
if (ggml_cuda_info().devices[device].vmm) { if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
} }
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
} }
...@@ -483,16 +447,7 @@ struct ggml_backend_cuda_buffer_context { ...@@ -483,16 +447,7 @@ struct ggml_backend_cuda_buffer_context {
} }
}; };
GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
return ctx->name.c_str();
}
GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
}
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -501,12 +456,16 @@ GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t ...@@ -501,12 +456,16 @@ GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t
free(buffer); free(buffer);
} }
GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
}
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
return ctx->dev_ptr; return ctx->dev_ptr;
} }
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
if (tensor->view_src != NULL) { if (tensor->view_src != NULL) {
...@@ -526,7 +485,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t ...@@ -526,7 +485,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
} }
} }
GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
...@@ -534,7 +493,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer ...@@ -534,7 +493,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
} }
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
...@@ -542,7 +501,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t ...@@ -542,7 +501,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
} }
GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
...@@ -550,7 +509,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t ...@@ -550,7 +509,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
} }
GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_backend_buffer_is_cuda(src->buffer)) { if (ggml_backend_buffer_is_cuda(src->buffer)) {
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
...@@ -571,7 +530,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t ...@@ -571,7 +530,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
...@@ -580,8 +539,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffe ...@@ -580,8 +539,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffe
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
} }
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .get_base = */ ggml_backend_cuda_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
...@@ -599,29 +557,27 @@ struct ggml_backend_cuda_buffer_type_context { ...@@ -599,29 +557,27 @@ struct ggml_backend_cuda_buffer_type_context {
std::string name; std::string name;
}; };
GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
return ctx->name.c_str(); return ctx->name.c_str();
} }
static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) { static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cuda_buffer_type_name; return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
} }
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
ggml_cuda_set_device(buft_ctx->device); ggml_cuda_set_device(buft_ctx->device);
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
void * dev_ptr; void * dev_ptr;
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device); cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error
cudaGetLastError(); cudaGetLastError();
GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err)); GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr; return nullptr;
} }
...@@ -630,13 +586,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe ...@@ -630,13 +586,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
} }
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 128; return 128;
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
size_t size = ggml_nbytes(tensor); size_t size = ggml_nbytes(tensor);
int64_t ne0 = tensor->ne[0]; int64_t ne0 = tensor->ne[0];
...@@ -651,8 +607,8 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen ...@@ -651,8 +607,8 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ NULL, // defaults to SIZE_MAX
...@@ -660,7 +616,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { ...@@ -660,7 +616,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .is_host = */ NULL, /* .is_host = */ NULL,
}; };
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
static std::mutex mutex; static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
...@@ -673,9 +629,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { ...@@ -673,9 +629,10 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
static bool ggml_backend_cuda_buffer_type_initialized = false; static bool ggml_backend_cuda_buffer_type_initialized = false;
if (!ggml_backend_cuda_buffer_type_initialized) { if (!ggml_backend_cuda_buffer_type_initialized) {
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
ggml_backend_cuda_buffer_types[i] = { ggml_backend_cuda_buffer_types[i] = {
/* .iface = */ ggml_backend_cuda_buffer_type_interface, /* .iface = */ ggml_backend_cuda_buffer_type_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
}; };
} }
...@@ -722,7 +679,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl ...@@ -722,7 +679,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
} }
struct ggml_backend_cuda_split_buffer_type_context { struct ggml_backend_cuda_split_buffer_type_context {
int main_device;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split; std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
std::string name;
}; };
struct ggml_backend_cuda_split_buffer_context { struct ggml_backend_cuda_split_buffer_context {
...@@ -745,30 +704,20 @@ struct ggml_backend_cuda_split_buffer_context { ...@@ -745,30 +704,20 @@ struct ggml_backend_cuda_split_buffer_context {
std::vector<ggml_tensor_extra_gpu *> tensor_extras; std::vector<ggml_tensor_extra_gpu *> tensor_extras;
}; };
GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
return GGML_CUDA_NAME "_Split";
GGML_UNUSED(buffer); static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
}
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
}
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
} }
GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
return (void *)0x1000; return (void *)0x1000;
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
...@@ -816,7 +765,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu ...@@ -816,7 +765,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
tensor->extra = extra; tensor->extra = extra;
} }
GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
// split tensors must always be set in their entirety at once // split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0); GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor)); GGML_ASSERT(size == ggml_nbytes(tensor));
...@@ -854,7 +803,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf ...@@ -854,7 +803,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buf
} }
} }
GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
// split tensors must always be set in their entirety at once // split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0); GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor)); GGML_ASSERT(size == ggml_nbytes(tensor));
...@@ -892,13 +841,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf ...@@ -892,13 +841,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buf
} }
} }
GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
GGML_UNUSED(value); GGML_UNUSED(value);
} }
static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
...@@ -912,17 +860,17 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { ...@@ -912,17 +860,17 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
// cuda split buffer type // cuda split buffer type
GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return GGML_CUDA_NAME "_Split"; ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
GGML_UNUSED(buft); return ctx->name.c_str();
} }
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name; return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
} }
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
// instead, we allocate them for each tensor separately in init_tensor // instead, we allocate them for each tensor separately in init_tensor
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
...@@ -932,13 +880,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc ...@@ -932,13 +880,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
} }
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 128; return 128;
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
size_t total_size = 0; size_t total_size = 0;
...@@ -965,14 +913,14 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_ ...@@ -965,14 +913,14 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
return total_size; return total_size;
} }
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false; return false;
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name, /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ NULL, // defaults to SIZE_MAX
...@@ -980,11 +928,11 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface ...@@ -980,11 +928,11 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
}; };
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
static std::mutex mutex; static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map; static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {}; std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
...@@ -1002,35 +950,35 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f ...@@ -1002,35 +950,35 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const f
} }
} }
auto it = buft_map.find(tensor_split_arr); auto it = buft_map.find({main_device, tensor_split_arr});
if (it != buft_map.end()) { if (it != buft_map.end()) {
return &it->second; return &it->second;
} }
auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
main_device,
tensor_split_arr,
GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
};
struct ggml_backend_buffer_type buft { struct ggml_backend_buffer_type buft {
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface, /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
/* .context = */ ctx,
}; };
auto result = buft_map.emplace(tensor_split_arr, buft); auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
return &result.first->second; return &result.first->second;
} }
// host buffer type // host buffer type
GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
return GGML_CUDA_NAME "_Host"; return GGML_CUDA_NAME "_Host";
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
return GGML_CUDA_NAME "_Host";
GGML_UNUSED(buffer);
}
GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context)); CUDA_CHECK(cudaFreeHost(buffer->context));
} }
...@@ -1044,7 +992,7 @@ static void * ggml_cuda_host_malloc(size_t size) { ...@@ -1044,7 +992,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error
cudaGetLastError(); cudaGetLastError();
GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err)); size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr; return nullptr;
} }
...@@ -1052,7 +1000,7 @@ static void * ggml_cuda_host_malloc(size_t size) { ...@@ -1052,7 +1000,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
return ptr; return ptr;
} }
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr = ggml_cuda_host_malloc(size); void * ptr = ggml_cuda_host_malloc(size);
if (ptr == nullptr) { if (ptr == nullptr) {
...@@ -1062,13 +1010,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_ ...@@ -1062,13 +1010,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft; buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
return buffer; return buffer;
} }
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
/* .iface = */ { /* .iface = */ {
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name, /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
...@@ -1078,6 +1025,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { ...@@ -1078,6 +1025,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
}, },
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
/* .context = */ nullptr, /* .context = */ nullptr,
}; };
...@@ -1102,120 +1050,12 @@ typedef void (*ggml_cuda_op_mul_mat_t)( ...@@ -1102,120 +1050,12 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
#define MUL_MAT_SRC1_COL_STRIDE 128 #define MUL_MAT_SRC1_COL_STRIDE 128
static __global__ void mul_mat_p021_f16_f32(
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
const half * x = (const half *) vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int channel_x = channel / (nchannels_y / nchannels_x);
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
float tmp = 0.0f;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
// x is transposed and permuted
const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
const float xi = __half2float(x[ix]);
const int row_y = col_x;
// y is not transposed but permuted
const int iy = channel*nrows_y + row_y;
tmp += xi * y[iy];
}
// dst is not transposed and not permuted
const int idst = channel*nrows_dst + row_dst;
// sum up partial sums and write back result
tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
const half * x = (const half *) vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int channel_x = channel / channel_x_divisor;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
const int idst = channel*nrows_dst + row_dst;
float tmp = 0.0f;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
const int row_y = col_x;
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
const int iy = channel*nrows_y + row_y;
const float xi = __half2float(x[ix]);
tmp += xi * y[iy];
}
// sum up partial sums and write back result
tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
static void ggml_mul_mat_p021_f16_f32_cuda(
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
const dim3 block_nums(1, nrows_x, nchannels_y);
const dim3 block_dims(WARP_SIZE, 1, 1);
mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
}
static void ggml_mul_mat_vec_nc_f16_f32_cuda(
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
const dim3 block_nums(1, nrows_x, nchannels_y);
const dim3 block_dims(WARP_SIZE, 1, 1);
mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
}
static cudaError_t ggml_cuda_cpy_tensor_2d( static cudaError_t ggml_cuda_cpy_tensor_2d(
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer)); GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
char * src_ptr = (char *) src->data; const char * src_ptr = (const char *) src->data;
char * dst_ptr = (char *) dst; char * dst_ptr = (char *) dst;
const int64_t ne0 = src->ne[0]; const int64_t ne0 = src->ne[0];
const int64_t nb0 = src->nb[0]; const int64_t nb0 = src->nb[0];
...@@ -1225,7 +1065,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( ...@@ -1225,7 +1065,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
const enum ggml_type type = src->type; const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type); const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type); const int64_t bs = ggml_blck_size(type);
int64_t i1_diff = i1_high - i1_low; const int64_t i1_diff = i1_high - i1_low;
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
if (nb0 == ts && nb1 == ts*ne0/bs) { if (nb0 == ts && nb1 == ts*ne0/bs) {
...@@ -1297,6 +1137,11 @@ static void ggml_cuda_op_mul_mat_cublas( ...@@ -1297,6 +1137,11 @@ static void ggml_cuda_op_mul_mat_cublas(
const half alpha_f16 = 1.0f; const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f; const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
CUBLAS_CHECK( CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
...@@ -1304,7 +1149,7 @@ static void ggml_cuda_op_mul_mat_cublas( ...@@ -1304,7 +1149,7 @@ static void ggml_cuda_op_mul_mat_cublas(
&alpha_f16, src0_ptr, CUDA_R_16F, ne00, &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10, src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc, &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
CUBLAS_COMPUTE_16F, cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
...@@ -1379,11 +1224,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { ...@@ -1379,11 +1224,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0); cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
if (err != cudaErrorPeerAccessAlreadyEnabled) { if (err != cudaErrorPeerAccessAlreadyEnabled) {
CUDA_CHECK(err); CUDA_CHECK(err);
} else {
// reset the error
cudaGetLastError();
} }
} else { } else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other); cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
if (err != cudaErrorPeerAccessNotEnabled) { if (err != cudaErrorPeerAccessNotEnabled) {
CUDA_CHECK(err); CUDA_CHECK(err);
} else {
// reset the error
cudaGetLastError();
} }
} }
} }
...@@ -1401,7 +1252,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { ...@@ -1401,7 +1252,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static cudaError_t ggml_cuda_Memcpy2DPeerAsync( static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {}; cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice; p.dstDevice = dstDevice;
...@@ -1415,7 +1266,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync( ...@@ -1415,7 +1266,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
GGML_UNUSED(dstDevice); GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice); GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
} }
static void ggml_cuda_op_mul_mat( static void ggml_cuda_op_mul_mat(
...@@ -1463,7 +1314,7 @@ static void ggml_cuda_op_mul_mat( ...@@ -1463,7 +1314,7 @@ static void ggml_cuda_op_mul_mat(
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12)); GGML_ASSERT(!(split && ne02 < ne12));
...@@ -1542,14 +1393,24 @@ static void ggml_cuda_op_mul_mat( ...@@ -1542,14 +1393,24 @@ static void ggml_cuda_op_mul_mat(
if (src0_is_contiguous) { if (src0_is_contiguous) {
dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data; dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
} else { } else {
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0)); // If src0 is not contiguous it will be copied to a temporary buffer.
} // This buffer needs to be cleared entirely because multiple regions will function as padding.
const size_t nbytes_data = ggml_nbytes(src0);
// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared: const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
// TODO: remove this for MUSA once the Guilty Lockup issue is resolved
#ifndef GGML_USE_MUSA
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
#else // GGML_USE_MUSA
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
#endif // !GGML_USE_MUSA
}
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00); const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream)); CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
} }
if (src1_on_device && src1_is_contiguous) { if (src1_on_device && src1_is_contiguous) {
...@@ -1720,58 +1581,6 @@ static void ggml_cuda_op_mul_mat( ...@@ -1720,58 +1581,6 @@ static void ggml_cuda_op_mul_mat(
} }
} }
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne12 = src1->ne[2];
cudaStream_t main_stream = ctx.stream();
void * src0_ddq = src0->data;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
}
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t ne12 = src1->ne[2];
cudaStream_t main_stream = ctx.stream();
void * src0_ddq = src0->data;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
const int64_t row_stride_x = nb01 / sizeof(half);
const int64_t channel_stride_x = nb02 / sizeof(half);
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}
static __global__ void k_compute_batched_ptrs( static __global__ void k_compute_batched_ptrs(
const half * src0_as_f16, const half * src1_as_f16, char * dst, const half * src0_as_f16, const half * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst, const void ** ptrs_src, void ** ptrs_dst,
...@@ -1833,6 +1642,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ...@@ -1833,6 +1642,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F; cudaDataType_t cu_data_type = CUDA_R_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
// dst strides // dst strides
size_t nbd2 = dst->nb[2]; size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3]; size_t nbd3 = dst->nb[3];
...@@ -1943,23 +1756,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ...@@ -1943,23 +1756,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
} }
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type) bool use_mul_mat_vec = src0->type == GGML_TYPE_F16
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1; && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type) bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
// if mmvq is available it's a better choice than dmmv: bool any_gpus_with_slow_fp16 = false;
#ifndef GGML_CUDA_FORCE_DMMV bool any_gpus_without_fp16_mma = false;
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV
bool any_gpus_with_slow_fp16 = false;
if (split) { if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
...@@ -1970,14 +1779,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor ...@@ -1970,14 +1779,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
continue; continue;
} }
const int cc = ggml_cuda_info().devices[id].cc; const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
} }
} else { } else {
const int cc = ggml_cuda_info().devices[ctx.device].cc; const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
} }
// debug helpers // debug helpers
...@@ -1988,18 +1799,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor ...@@ -1988,18 +1799,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
// FP32 precision KQ single-batch for batch size 1 without FlashAttention // the custom F16 vector kernel can be used over batched cuBLAS GEMM
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst); // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
} else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
// FP32 precision KQV single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch without FlashAttention // general KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) { } else if (use_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
} else if (use_mul_mat_vec_q) { } else if (use_mul_mat_vec_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
} else if (use_mul_mat_q) { } else if (use_mul_mat_q) {
...@@ -2070,7 +1879,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ...@@ -2070,7 +1879,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers"); GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
...@@ -2203,11 +2012,17 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ...@@ -2203,11 +2012,17 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat? // why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) { if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
} }
switch (dst->op) { switch (dst->op) {
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
break;
case GGML_OP_COUNT_EQUAL:
ggml_cuda_count_equal(ctx, dst);
break;
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
ggml_cuda_op_repeat(ctx, dst); ggml_cuda_op_repeat(ctx, dst);
break; break;
...@@ -2313,7 +2128,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ...@@ -2313,7 +2128,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
break; break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
return false; return false;
} else { } else {
ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
...@@ -2384,8 +2199,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ...@@ -2384,8 +2199,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst); ggml_cuda_cross_entropy_loss(ctx, dst);
break; break;
case GGML_OP_RWKV_WKV: case GGML_OP_RWKV_WKV6:
ggml_cuda_op_rwkv_wkv(ctx, dst); ggml_cuda_op_rwkv_wkv6(ctx, dst);
break; break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
ggml_cuda_cross_entropy_loss_back(ctx, dst); ggml_cuda_cross_entropy_loss_back(ctx, dst);
...@@ -2399,7 +2214,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ...@@ -2399,7 +2214,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) { if (err != cudaSuccess) {
GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst)); GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
CUDA_CHECK(err); CUDA_CHECK(err);
} }
...@@ -2410,26 +2225,20 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ...@@ -2410,26 +2225,20 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
// backend // backend
GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
return cuda_ctx->name.c_str(); return cuda_ctx->name.c_str();
} }
GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) { static void ggml_backend_cuda_free(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
delete cuda_ctx; delete cuda_ctx;
delete backend; delete backend;
} }
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
}
GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
...@@ -2438,7 +2247,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ...@@ -2438,7 +2247,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
} }
GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
...@@ -2447,7 +2256,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, ...@@ -2447,7 +2256,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream())); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
} }
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
...@@ -2468,7 +2277,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ ...@@ -2468,7 +2277,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif #endif
return false; return false;
} }
...@@ -2502,7 +2311,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ ...@@ -2502,7 +2311,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
return true; return true;
} }
GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
...@@ -2510,6 +2319,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ...@@ -2510,6 +2319,7 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
#ifdef USE_CUDA_GRAPH
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
graph_node_properties->node_address = node->data; graph_node_properties->node_address = node->data;
graph_node_properties->node_op = node->op; graph_node_properties->node_op = node->op;
...@@ -2560,8 +2370,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra ...@@ -2560,8 +2370,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
return true; return true;
} }
#endif
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device); ggml_cuda_set_device(cuda_ctx->device);
...@@ -2584,7 +2395,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2584,7 +2395,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif #endif
} }
} }
...@@ -2632,17 +2443,17 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2632,17 +2443,17 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
continue; continue;
} }
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
#endif #endif
} }
if (node->op == GGML_OP_MUL_MAT_ID) { if (node->op == GGML_OP_MUL_MAT_ID) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
#endif #endif
} }
...@@ -2651,7 +2462,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2651,7 +2462,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
// Changes in batch size or context size can cause changes to the grid size of some kernels. // Changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false; use_cuda_graph = false;
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif #endif
} }
...@@ -2663,7 +2474,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2663,7 +2474,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (!ptr) { if (!ptr) {
use_cuda_graph = false; use_cuda_graph = false;
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif #endif
} else { } else {
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
...@@ -2687,7 +2498,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2687,7 +2498,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif #endif
} }
} }
...@@ -2719,14 +2530,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2719,14 +2530,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) { if (node->src[j] != nullptr) {
assert(node->src[j]->buffer); assert(node->src[j]->buffer);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
} }
} }
#endif #endif
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
if (!ok) { if (!ok) {
GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
} }
GGML_ASSERT(ok); GGML_ASSERT(ok);
} }
...@@ -2745,7 +2557,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2745,7 +2557,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
use_cuda_graph = false; use_cuda_graph = false;
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif #endif
} else { } else {
graph_evaluated_or_captured = true; // CUDA graph has been captured graph_evaluated_or_captured = true; // CUDA graph has been captured
...@@ -2812,7 +2624,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2812,7 +2624,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) { if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG #ifndef NDEBUG
GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__); GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif #endif
// The pre-existing graph exec cannot be updated due to violated constraints // The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate // so instead clear error and re-instantiate
...@@ -2833,8 +2645,195 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ...@@ -2833,8 +2645,195 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
} }
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
}
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
if (ggml_backend_is_cuda(backend)) {
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
} else {
#if 0
// untested
auto wait_fn = [](void * user_data) {
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
ggml_backend_event_synchronize(event);
};
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
#endif
GGML_ABORT("fatal error");
}
}
static const ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_get_name,
/* .free = */ ggml_backend_cuda_free,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
};
static ggml_guid_t ggml_backend_cuda_guid() {
static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
return &guid;
}
bool ggml_backend_is_cuda(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
}
int ggml_backend_cuda_get_device_count() {
return ggml_cuda_info().device_count;
}
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
snprintf(description, description_size, "%s", prop.name);
}
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
ggml_cuda_set_device(device);
CUDA_CHECK(cudaMemGetInfo(free, total));
}
bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
return false;
}
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return false;
}
return true;
#else
return false;
#endif
}
void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
return;
}
cudaError_t err = cudaHostUnregister(buffer);
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
}
}
// backend device
struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->name.c_str();
}
static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemGetInfo(free, total));
}
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
#else
bool events = true;
#endif
props->caps = {
/* .async = */ true,
/* .host_buffer = */ host_buffer,
/* .buffer_from_host_ptr = */ false,
/* .events = */ events,
};
}
static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ggml_backend_cuda_init(ctx->device);
}
static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ggml_backend_cuda_buffer_type(ctx->device);
}
static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return ggml_backend_cuda_host_buffer_type();
}
// TODO: move these functions here
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
// split buffers can only be used with GGML_OP_MUL_MAT
if (op->op != GGML_OP_MUL_MAT) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
return false;
}
}
}
// check if all the sources are allocated on this device
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
if (buft_ctx->device != dev_ctx->device) {
return false;
}
}
}
switch (op->op) { switch (op->op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) { switch (ggml_get_unary_op(op)) {
...@@ -2859,6 +2858,17 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ...@@ -2859,6 +2858,17 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
{ {
struct ggml_tensor * a = op->src[0]; struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = op->src[1]; struct ggml_tensor * b = op->src[1];
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
// this avoids some edge cases (and the performance would not be good anyways)
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
int64_t row_low;
int64_t row_high;
get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
if (row_low == row_high) {
return false;
}
}
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) { if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false; return false;
} }
...@@ -2964,6 +2974,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ...@@ -2964,6 +2974,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
return false; return false;
} break; } break;
case GGML_OP_DUP: case GGML_OP_DUP:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_ARGMAX:
case GGML_OP_COUNT_EQUAL:
{
return true;
} break;
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
{ {
ggml_type src0_type = op->src[0]->type; ggml_type src0_type = op->src[0]->type;
...@@ -2985,18 +3004,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ...@@ -2985,18 +3004,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
} }
return false; return false;
} break; } break;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
break;
case GGML_OP_NONE: case GGML_OP_NONE:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE: case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_ADD1: case GGML_OP_ADD1:
case GGML_OP_SUB: case GGML_OP_SUB:
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SQRT: case GGML_OP_SQRT:
...@@ -3012,7 +3033,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ...@@ -3012,7 +3033,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ROPE: case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]); return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_SUM: case GGML_OP_SUM:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
...@@ -3025,12 +3045,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ...@@ -3025,12 +3045,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
case GGML_OP_RWKV_WKV: case GGML_OP_RWKV_WKV6:
return true; return true;
case GGML_OP_FLASH_ATTN_EXT: { case GGML_OP_FLASH_ATTN_EXT: {
#ifndef FLASH_ATTN_AVAILABLE #ifndef FLASH_ATTN_AVAILABLE
return false; return false;
#endif #endif
if (op->src[1]->type == GGML_TYPE_BF16 || op->src[2]->type == GGML_TYPE_BF16) {
return false;
}
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) { if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
return true; return true;
} }
...@@ -3040,7 +3063,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ...@@ -3040,7 +3063,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) { if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
return true; return true;
} }
const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc; const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
} }
case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS:
...@@ -3050,203 +3073,245 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ...@@ -3050,203 +3073,245 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
default: default:
return false; return false;
} }
GGML_UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cuda_split(buft)) { return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
return true; }
}
if (ggml_backend_buft_is_cuda(buft)) { static int64_t get_op_batch_size(const ggml_tensor * op) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; switch (op->op) {
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; case GGML_OP_GET_ROWS:
return buft_ctx->device == cuda_ctx->device; return 0;
case GGML_OP_MUL_MAT:
return op->ne[1];
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ROPE:
return op->ne[2];
default:
return ggml_nrows(op);
} }
return false;
} }
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
const int min_batch_size = 32; const int min_batch_size = 32;
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || return get_op_batch_size(op) >= min_batch_size;
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
GGML_UNUSED(backend); GGML_UNUSED(dev);
} }
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
return nullptr; return nullptr;
#else #else
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(cuda_ctx->device); ggml_cuda_set_device(dev_ctx->device);
cudaEvent_t event; cudaEvent_t event;
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
return new ggml_backend_event { return new ggml_backend_event {
/* .backend = */ backend, /* .device = */ dev,
/* .context = */ event, /* .context = */ event,
}; };
#endif #endif
} }
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) { static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context)); GGML_UNUSED(dev);
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
delete event; delete event;
} }
static void ggml_backend_cuda_event_record(ggml_backend_event_t event) { static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context; GGML_UNUSED(dev);
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
} }
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; /* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
/* .get_type = */ ggml_backend_cuda_device_get_type,
/* .get_props = */ ggml_backend_cuda_device_get_props,
/* .init_backend = */ ggml_backend_cuda_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
};
if (ggml_backend_is_cuda(event->backend)) { // backend reg
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
} else {
#if 0
// untested
auto wait_fn = [](void * user_data) {
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
ggml_backend_event_synchronize(event);
};
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); struct ggml_backend_cuda_reg_context {
#endif std::vector<ggml_backend_dev_t> devices;
GGML_ABORT("fatal error"); };
}
}
static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) { static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); GGML_UNUSED(reg);
return GGML_CUDA_NAME;
} }
static ggml_backend_i ggml_backend_cuda_interface = { static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
/* .get_name = */ ggml_backend_cuda_name, ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
/* .free = */ ggml_backend_cuda_free, return ctx->devices.size();
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, }
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op,
/* .supports_buft = */ ggml_backend_cuda_supports_buft,
/* .offload_op = */ ggml_backend_cuda_offload_op,
/* .event_new = */ ggml_backend_cuda_event_new,
/* .event_free = */ ggml_backend_cuda_event_free,
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
};
static ggml_guid_t ggml_backend_cuda_guid() { static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 }; ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
return &guid; GGML_ASSERT(index < ctx->devices.size());
return ctx->devices[index];
} }
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) { static std::vector<ggml_backend_feature> features = []() {
GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device); std::vector<ggml_backend_feature> features;
return nullptr; #define _STRINGIFY(...) #__VA_ARGS__
} #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device); #ifdef __CUDA_ARCH_LIST__
if (ctx == nullptr) { features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__); #endif
return nullptr;
}
ggml_backend_t cuda_backend = new ggml_backend { #ifdef GGML_CUDA_FORCE_MMQ
/* .guid = */ ggml_backend_cuda_guid(), features.push_back({ "FORCE_MMQ", "1" });
/* .interface = */ ggml_backend_cuda_interface, #endif
/* .context = */ ctx
};
return cuda_backend; #ifdef GGML_CUDA_FORCE_CUBLAS
} features.push_back({ "FORCE_CUBLAS", "1" });
#endif
GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) { #ifdef GGML_CUDA_NO_VMM
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid()); features.push_back({ "NO_VMM", "1" });
} #endif
GGML_CALL int ggml_backend_cuda_get_device_count() { #ifdef GGML_CUDA_NO_PEER_COPY
return ggml_cuda_info().device_count; features.push_back({ "NO_PEER_COPY", "1" });
} #endif
GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { #ifdef GGML_CUDA_F16
cudaDeviceProp prop; features.push_back({ "F16", "1" });
CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); #endif
snprintf(description, description_size, "%s", prop.name);
}
GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { #ifdef GGML_CUDA_USE_GRAPHS
ggml_cuda_set_device(device); features.push_back({ "USE_GRAPHS", "1" });
#endif
CUDA_CHECK(cudaMemGetInfo(free, total)); #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
} features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
#endif
GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { #ifdef GGML_CUDA_FA_ALL_QUANTS
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { features.push_back({ "FA_ALL_QUANTS", "1" });
return false; #endif
}
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA) #undef _STRINGIFY
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); #undef STRINGIFY
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, features.push_back({ nullptr, nullptr });
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return false; return features;
} }();
return true;
#else return features.data();
return false;
#endif GGML_UNUSED(reg);
} }
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) { GGML_UNUSED(reg);
return; if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
return (void *)ggml_backend_cuda_split_buffer_type;
} }
if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
cudaError_t err = cudaHostUnregister(buffer); return (void *)ggml_backend_cuda_register_host_buffer;
if (err != cudaSuccess) { }
// clear the error if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
cudaGetLastError(); return (void *)ggml_backend_cuda_unregister_host_buffer;
} }
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cuda_get_features;
}
return nullptr;
} }
static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
/* .get_name = */ ggml_backend_cuda_reg_get_name,
/* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
/* .get_device_get = */ ggml_backend_cuda_reg_get_device,
/* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
};
// backend registry // backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_reg_t ggml_backend_cuda_reg() {
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); static ggml_backend_reg reg;
return cuda_backend; static bool initialized = false;
GGML_UNUSED(params); {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
ggml_cuda_set_device(i);
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
/* .context = */ dev_ctx
};
ctx->devices.push_back(dev);
}
reg = ggml_backend_reg {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cuda_reg_interface,
/* .context = */ ctx
};
}
initialized = true;
}
return &reg;
} }
GGML_CALL int ggml_backend_cuda_reg_devices() { ggml_backend_t ggml_backend_cuda_init(int device) {
int device_count = ggml_backend_cuda_get_device_count(); if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
for (int i = 0; i < device_count; i++) { return nullptr;
char name[128]; }
snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i); ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
if (ctx == nullptr) {
GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
return nullptr;
} }
return device_count;
ggml_backend_t cuda_backend = new ggml_backend {
/* .guid = */ ggml_backend_cuda_guid(),
/* .interface = */ ggml_backend_cuda_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
/* .context = */ ctx,
};
return cuda_backend;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -117,9 +117,9 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ...@@ -117,9 +117,9 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t OH = is_2D ? dst->ne[2] : 1; const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1]; const int64_t OW = dst->ne[1];
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[3]; const int64_t batch = src1->ne[is_2D ? 3 : 2];
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
if(dst->type == GGML_TYPE_F16) { if(dst->type == GGML_TYPE_F16) {
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -34,8 +34,6 @@ void ggml_cuda_op_mul_mat_q( ...@@ -34,8 +34,6 @@ void ggml_cuda_op_mul_mat_q(
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t nb01 = src0->nb[1];
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1]; const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0); GGML_ASSERT(ne10 % QK8_1 == 0);
...@@ -43,7 +41,7 @@ void ggml_cuda_op_mul_mat_q( ...@@ -43,7 +41,7 @@ void ggml_cuda_op_mul_mat_q(
const int64_t ne0 = dst->ne[0]; const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
const int64_t stride00 = nb01 / ggml_type_size(src0->type); const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
int id = ggml_cuda_get_device(); int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc; const int compute_capability = ggml_cuda_info().devices[id].cc;
...@@ -176,5 +174,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { ...@@ -176,5 +174,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }
return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -126,9 +126,9 @@ static constexpr __device__ int get_mmq_x_max_device() { ...@@ -126,9 +126,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 128; return 128;
#else // INT8_MMA_AVAILABLE #else // INT8_MMA_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
return 128; return 128;
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= CC_VOLTA
#ifdef GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_MMQ
...@@ -141,7 +141,7 @@ static constexpr __device__ int get_mmq_x_max_device() { ...@@ -141,7 +141,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // INT8_MMA_AVAILABLE #endif // INT8_MMA_AVAILABLE
} }
...@@ -150,7 +150,7 @@ static constexpr int get_mmq_y_host(const int cc) { ...@@ -150,7 +150,7 @@ static constexpr int get_mmq_y_host(const int cc) {
} }
static constexpr __device__ int get_mmq_y_device() { static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA1) #if defined(RDNA1)
return 64; return 64;
#else #else
...@@ -162,7 +162,7 @@ static constexpr __device__ int get_mmq_y_device() { ...@@ -162,7 +162,7 @@ static constexpr __device__ int get_mmq_y_device() {
#else #else
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0} #define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
...@@ -2595,17 +2595,17 @@ static __device__ void mul_mat_q_process_tile( ...@@ -2595,17 +2595,17 @@ static __device__ void mul_mat_q_process_tile(
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
template <ggml_type type, int mmq_x, int nwarps, bool need_check> template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2) #if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2) #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else #else
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1) __launch_bounds__(WARP_SIZE*nwarps, 1)
#else #else
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q( static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
...@@ -2620,7 +2620,7 @@ static __global__ void mul_mat_q( ...@@ -2620,7 +2620,7 @@ static __global__ void mul_mat_q(
constexpr int mmq_y = get_mmq_y_device(); constexpr int mmq_y = get_mmq_y_device();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
{ {
constexpr bool fixup = false; constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup> mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
...@@ -2628,7 +2628,7 @@ static __global__ void mul_mat_q( ...@@ -2628,7 +2628,7 @@ static __global__ void mul_mat_q(
blockIdx.x, blockIdx.y, 0, ne00/qk); blockIdx.x, blockIdx.y, 0, ne00/qk);
return; return;
} }
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
const int64_t blocks_per_ne00 = ne00 / qk; const int64_t blocks_per_ne00 = ne00 / qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk;
...@@ -2791,14 +2791,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a ...@@ -2791,14 +2791,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc); const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shmem_limit_raised[id]) { if (!shmem_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
shmem_limit_raised[id] = true; shmem_limit_raised[id] = true;
} }
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
const int nty = (args.ne01 + mmq_y - 1) / mmq_y; const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "common.cuh"
#include "mmv.cuh"
template <typename type_acc, int block_size>
static __global__ void mul_mat_vec(
const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
const int64_t row = blockIdx.x;
const int64_t channel = blockIdx.z;
const int tid = threadIdx.x;
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
y += channel *stride_channel_y;
dst += channel *stride_channel_dst;
const half2 * x2 = (const half2 *) x;
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[];
float * buf_iw = (float *) data_mmv;
if (block_size > WARP_SIZE) {
if (tid < WARP_SIZE) {
buf_iw[tid] = 0.0f;
}
__syncthreads();
}
float sumf;
if (std::is_same<type_acc, float>::value) {
sumf = 0.0f;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = __half22float2(x2[col2]);
const float2 tmpy = y2[col2];
sumf += tmpx.x * tmpy.x;
sumf += tmpx.y * tmpy.y;
}
} else {
#ifdef FP16_AVAILABLE
half2 sumh2 = make_half2(0.0f, 0.0f);
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmp = y2[col2];
sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
}
sumf = __low2float(sumh2) + __high2float(sumh2);
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
sumf = warp_reduce_sum(sumf);
if (block_size > WARP_SIZE) {
buf_iw[tid/WARP_SIZE] = sumf;
__syncthreads();
if (tid > WARP_SIZE) {
return;
}
sumf = buf_iw[tid];
sumf = warp_reduce_sum(sumf);
}
if (tid != 0) {
return;
}
dst[row] = sumf;
}
template <typename type_acc>
static void launch_mul_mat_vec_cuda(
const half * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(nchannels_y % nchannels_x == 0);
const int64_t channel_ratio = nchannels_y / nchannels_x;
int64_t block_size_best = WARP_SIZE;
int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
if (niter < niter_best) {
niter_best = niter;
block_size_best = block_size;
}
}
const int smem = WARP_SIZE*sizeof(float);
const dim3 block_nums(nrows, 1, nchannels_y);
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
case 32: {
mul_mat_vec<type_acc, 32><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 64: {
mul_mat_vec<type_acc, 64><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 96: {
mul_mat_vec<type_acc, 96><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 128: {
mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 160: {
mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 192: {
mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 224: {
mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 256: {
mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
default: {
GGML_ABORT("fatal error");
} break;
}
}
static void mul_mat_vec_cuda(
const half * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
enum ggml_prec prec, cudaStream_t stream) {
switch (prec) {
case GGML_PREC_DEFAULT: {
launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
} break;
case GGML_PREC_F32: {
launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
} break;
}
}
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
GGML_ASSERT(src1->ne[1] == 1);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
const half * src0_d = (const half *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const int64_t ne02 = src0->ne[2];
const int64_t ne12 = src1->ne[2];
GGML_ASSERT(dst->ne[2] == ne12);
GGML_ASSERT(src0->ne[3] == 1);
GGML_ASSERT(src1->ne[3] == 1);
GGML_ASSERT( dst->ne[3] == 1);
const int64_t stride_row = src0->nb[1] / ggml_type_size(src0->type);
const int64_t channel_stride_x = src0->nb[2] / ggml_type_size(src0->type);
const int64_t channel_stride_y = src1->nb[2] / ggml_type_size(src1->type);
const int64_t channel_stride_dst = dst->nb[2] / ggml_type_size( dst->type);
mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
}
void ggml_cuda_op_mul_mat_vec(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low;
GGML_ASSERT(src1_ncols == 1);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
// ggml_cuda_op provides single, contiguous matrices
const int64_t stride_row = ne00;
const int64_t nchannels_x = 1;
const int64_t nchannels_y = 1;
const int64_t channel_stride_x = 0;
const int64_t channel_stride_y = 0;
const int64_t channel_stride_dst = 0;
mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
GGML_UNUSED(ctx);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
GGML_UNUSED(src1_ddq_i);
GGML_UNUSED(src1_ncols);
GGML_UNUSED(src1_padded_row_size);
}
/** /**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -26,21 +26,13 @@ ...@@ -26,21 +26,13 @@
#include "common.cuh" #include "common.cuh"
// dmmv = dequantize_mul_mat_vec // maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512
// TODO: remove this? void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif
#ifndef GGML_CUDA_MMV_Y void ggml_cuda_op_mul_mat_vec(
#define GGML_CUDA_MMV_Y 1
#endif
void ggml_cuda_op_dequantize_mul_mat_vec(
ggml_backend_cuda_context & ctx, ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream); const int64_t src1_padded_row_size, cudaStream_t stream);
bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment