Unverified commit 7a81daf0, authored by Jeffrey Morgan, committed by GitHub
Browse files

llama: update vendor code to commit ba1cb19c (#8101)

parent 60f75560
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -330,7 +330,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst ...@@ -330,7 +330,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
// On AMD the tile kernels perform poorly, use the vec kernel instead: // On AMD the tile kernels perform poorly, use the vec kernel instead:
if (cc >= CC_OFFSET_AMD) { if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
} else { } else {
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -203,7 +203,7 @@ static ggml_cuda_device_info ggml_cuda_init() { ...@@ -203,7 +203,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].smpb = prop.sharedMemPerBlock; info.devices[id].smpb = prop.sharedMemPerBlock;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
info.devices[id].smpbo = prop.sharedMemPerBlock; info.devices[id].smpbo = prop.sharedMemPerBlock;
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
#else #else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
...@@ -1111,7 +1111,7 @@ static void ggml_cuda_op_mul_mat_cublas( ...@@ -1111,7 +1111,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const int compute_capability = ggml_cuda_info().devices[id].cc; const int compute_capability = ggml_cuda_info().devices[id].cc;
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id)); ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
if (src0->type != GGML_TYPE_F16) { if (src0->type != GGML_TYPE_F16) {
...@@ -1138,7 +1138,7 @@ static void ggml_cuda_op_mul_mat_cublas( ...@@ -1138,7 +1138,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const half beta_f16 = 0.0f; const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F; cu_compute_type = CUBLAS_COMPUTE_32F;
} }
...@@ -1642,7 +1642,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ...@@ -1642,7 +1642,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F; cudaDataType_t cu_data_type = CUDA_R_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F; cu_compute_type = CUBLAS_COMPUTE_32F;
} }
...@@ -2392,7 +2392,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ...@@ -2392,7 +2392,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
std::vector<void *> ggml_cuda_cpy_fn_ptrs; std::vector<void *> ggml_cuda_cpy_fn_ptrs;
if (cuda_ctx->cuda_graph->graph == nullptr) { if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
...@@ -3064,7 +3064,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g ...@@ -3064,7 +3064,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return true; return true;
} }
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
} }
case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS:
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
...@@ -3246,7 +3246,7 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con ...@@ -3246,7 +3246,7 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = { static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
/* .get_name = */ ggml_backend_cuda_reg_get_name, /* .get_name = */ ggml_backend_cuda_reg_get_name,
/* .get_device_count = */ ggml_backend_cuda_reg_get_device_count, /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
/* .get_device_get = */ ggml_backend_cuda_reg_get_device, /* .get_device = */ ggml_backend_cuda_reg_get_device,
/* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address, /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
}; };
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -197,7 +197,7 @@ struct mma_int_C_I16J8 { ...@@ -197,7 +197,7 @@ struct mma_int_C_I16J8 {
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) { __device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
#ifdef INT8_MMA_AVAILABLE #ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};" asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0])); : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
...@@ -209,7 +209,7 @@ struct mma_int_C_I16J8 { ...@@ -209,7 +209,7 @@ struct mma_int_C_I16J8 {
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3]) : "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[1]), "r"(mma_B.x[0])); : "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#endif // __CUDA_ARCH__ >= CC_AMPERE #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#else #else
GGML_UNUSED(mma_A); GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B); GGML_UNUSED(mma_B);
...@@ -219,7 +219,7 @@ struct mma_int_C_I16J8 { ...@@ -219,7 +219,7 @@ struct mma_int_C_I16J8 {
__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) { __device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
#ifdef INT8_MMA_AVAILABLE #ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};" asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1])); : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
...@@ -237,7 +237,7 @@ struct mma_int_C_I16J8 { ...@@ -237,7 +237,7 @@ struct mma_int_C_I16J8 {
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3]) : "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[3]), "r"(mma_B.x[1])); : "r"(mma_A.x[3]), "r"(mma_B.x[1]));
#endif // __CUDA_ARCH__ >= CC_AMPERE #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#else #else
GGML_UNUSED(mma_A); GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B); GGML_UNUSED(mma_B);
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -53,7 +53,7 @@ void ggml_cuda_op_mul_mat_q( ...@@ -53,7 +53,7 @@ void ggml_cuda_op_mul_mat_q(
// The stream-k decomposition is only faster for recent NVIDIA GPUs. // The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool. // Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11; const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) { switch (src0->type) {
...@@ -162,7 +162,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { ...@@ -162,7 +162,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return true; return true;
} }
if (cc < MIN_CC_DP4A) { if (cc < GGML_CUDA_CC_DP4A) {
return false; return false;
} }
...@@ -170,9 +170,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { ...@@ -170,9 +170,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return true; return true;
#endif //GGML_CUDA_FORCE_MMQ #endif //GGML_CUDA_FORCE_MMQ
if (cc < CC_OFFSET_AMD) { if (cc < GGML_CUDA_CC_OFFSET_AMD) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }
return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
} }
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -115,9 +115,9 @@ struct tile_x_sizes { ...@@ -115,9 +115,9 @@ struct tile_x_sizes {
static constexpr int get_mmq_x_max_host(const int cc) { static constexpr int get_mmq_x_max_host(const int cc) {
return int8_mma_available(cc) ? 128 : return int8_mma_available(cc) ? 128 :
#ifdef GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_MMQ
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64; cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
#else #else
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64; cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
#endif // GGML_CUDA_FORCE_MMQ #endif // GGML_CUDA_FORCE_MMQ
} }
...@@ -130,23 +130,23 @@ static constexpr __device__ int get_mmq_x_max_device() { ...@@ -130,23 +130,23 @@ static constexpr __device__ int get_mmq_x_max_device() {
return 128; return 128;
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#ifdef GGML_CUDA_FORCE_MMQ #ifdef GGML_CUDA_FORCE_MMQ
return MMQ_DP4A_MAX_BATCH_SIZE; return MMQ_DP4A_MAX_BATCH_SIZE;
#else // GGML_CUDA_FORCE_MMQ #else // GGML_CUDA_FORCE_MMQ
return 128; return 128;
#endif // GGML_CUDA_FORCE_MMQ #endif // GGML_CUDA_FORCE_MMQ
#else // __CUDA_ARCH__ >= CC_VOLTA #else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // INT8_MMA_AVAILABLE #endif // INT8_MMA_AVAILABLE
} }
static constexpr int get_mmq_y_host(const int cc) { static constexpr int get_mmq_y_host(const int cc) {
return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64); return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
} }
static constexpr __device__ int get_mmq_y_device() { static constexpr __device__ int get_mmq_y_device() {
...@@ -157,11 +157,11 @@ static constexpr __device__ int get_mmq_y_device() { ...@@ -157,11 +157,11 @@ static constexpr __device__ int get_mmq_y_device() {
return 128; return 128;
#endif // defined RDNA1 #endif // defined RDNA1
#else #else
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
return 128; return 128;
#else #else
return 64; return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
} }
...@@ -2600,11 +2600,11 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check> ...@@ -2600,11 +2600,11 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else #else
#if __CUDA_ARCH__ >= CC_VOLTA #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1) __launch_bounds__(WARP_SIZE*nwarps, 1)
#else #else
__launch_bounds__(WARP_SIZE*nwarps, 2) __launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
static __global__ void mul_mat_q( static __global__ void mul_mat_q(
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
...@@ -2620,7 +2620,7 @@ static __global__ void mul_mat_q( ...@@ -2620,7 +2620,7 @@ static __global__ void mul_mat_q(
constexpr int mmq_y = get_mmq_y_device(); constexpr int mmq_y = get_mmq_y_device();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
{ {
constexpr bool fixup = false; constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup> mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
...@@ -2628,7 +2628,7 @@ static __global__ void mul_mat_q( ...@@ -2628,7 +2628,7 @@ static __global__ void mul_mat_q(
blockIdx.x, blockIdx.y, 0, ne00/qk); blockIdx.x, blockIdx.y, 0, ne00/qk);
return; return;
} }
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
const int64_t blocks_per_ne00 = ne00 / qk; const int64_t blocks_per_ne00 = ne00 / qk;
constexpr int blocks_per_iter = MMQ_ITER_K / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk;
...@@ -2851,7 +2851,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda ...@@ -2851,7 +2851,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_x_max = get_mmq_x_max_host(cc);
const int mmq_y = get_mmq_y_host(cc); const int mmq_y = get_mmq_y_host(cc);
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
int mmq_x_best = 0; int mmq_x_best = 0;
int nparts_best = INT_MAX; int nparts_best = INT_MAX;
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -83,7 +83,7 @@ static __global__ void mul_mat_vec( ...@@ -83,7 +83,7 @@ static __global__ void mul_mat_vec(
if (block_size > WARP_SIZE) { if (block_size > WARP_SIZE) {
buf_iw[tid/WARP_SIZE] = sumf; buf_iw[tid/WARP_SIZE] = sumf;
__syncthreads(); __syncthreads();
if (tid > WARP_SIZE) { if (tid >= WARP_SIZE) {
return; return;
} }
sumf = buf_iw[tid]; sumf = buf_iw[tid];
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -168,7 +168,7 @@ static void mul_mat_vec_q_cuda( ...@@ -168,7 +168,7 @@ static void mul_mat_vec_q_cuda(
int64_t nwarps = 1; int64_t nwarps = 1;
int64_t rows_per_cuda_block = 1; int64_t rows_per_cuda_block = 1;
if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
switch(ncols_y) { switch(ncols_y) {
case 1: case 1:
nwarps = 4; nwarps = 4;
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment