Unverified Commit 0cefd46f authored by Jeffrey Morgan's avatar Jeffrey Morgan Committed by GitHub
Browse files

llama: update to commit de4c07f93 (#10655)

parent ad035ad5
......@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
const int i10 = blockIdx.x;
const int i11 = blockIdx.z / ne12;
const int i12 = blockIdx.z % ne12;
if (i00 >= ne00) {
return;
......@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
const int i10 = blockIdx.x;
const int i11 = blockIdx.z / ne12;
const int i12 = blockIdx.z % ne12;
if (i00 >= ne00) {
return;
......@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
......@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
......
......@@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
// initialize padding to 0 to avoid possible NaN values
size_t original_size = ggml_nbytes(tensor);
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
const size_t original_size = ggml_nbytes(tensor);
const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size) {
ggml_cuda_set_device(ctx->device);
......@@ -680,6 +680,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
if (ggml_is_quantized(tensor->type)) {
if (ne0 % MATRIX_ROW_PADDING != 0) {
GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
}
......@@ -802,6 +803,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
......@@ -853,6 +855,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
......@@ -891,6 +894,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
// split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
......@@ -972,6 +976,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
size_t total_size = 0;
......@@ -1534,6 +1539,8 @@ static void ggml_cuda_op_mul_mat(
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
......@@ -1905,13 +1912,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
// Therefore, in such cases use cuBLAS.
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type)
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool any_gpus_with_slow_fp16 = false;
......@@ -2065,9 +2078,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
ggml_tensor src0_slice = *src0;
src0_slice.ne[2] = 1;
src0_slice.nb[3] = src0_slice.nb[2];
src0_slice.data = (char *) src0->data + i02*nb02;
src0_slice.ne[2] = 1;
src0_slice.nb[3] = src0_slice.nb[2];
src0_slice.op = GGML_OP_VIEW;
src0_slice.view_src = dst->src[0]; // non-const pointer to src0
src0_slice.data = (char *) src0->data + i02*nb02;
ggml_tensor src1_slice;
memset(&src1_slice, 0, sizeof(src1_slice));
......@@ -3213,16 +3228,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return false;
#endif // FLASH_ATTN_AVAILABLE
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
// different head sizes of K and V are not supported yet
return false;
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
if (!new_mma_available(cc) || cc < GGML_CUDA_CC_AMPERE) {
return false;
}
const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
}
if (op->src[0]->ne[0] == 192) {
return false;
}
if (op->src[0]->ne[0] == 576) {
// DeepSeek MLA
return false;
}
if (op->src[0]->ne[3] != 1) {
return false;
}
......
......@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const int64_t s01 = src0->nb[1] / ts_src0;
......@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
ne00, ne01, ne1, s01, s1,
ne00, ne01, ne1, s01, ne11, s1,
ne02, ne12, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
......@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
ne00, ne01, ne_get_rows, s01, s1,
ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
ne02, ne02, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
......@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
const mmq_args args = {
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
ne00, row_diff, src1_ncols, stride01, nrows_dst,
ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
1, 1, 0, 0, 0,
1, 1, 0, 0, 0,
use_stream_k};
......
......@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
static __device__ __forceinline__ void mul_mat_q_process_tile(
const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
const int stride_row_x, const int ncols_y, const int stride_col_dst,
const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
......@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
static __global__ void mul_mat_q(
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
......@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int mmq_y = get_mmq_y_device();
const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
// Initialize the ids for writing back data with just the index.
// For regular matrix multiplications this is never changed.
......@@ -2636,6 +2636,7 @@ static __global__ void mul_mat_q(
ids_dst_shared[j] = j;
}
__syncthreads();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
......@@ -2647,8 +2648,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int col_high = ncols_dst;
int col_diff = ncols_dst;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
......@@ -2664,6 +2665,7 @@ static __global__ void mul_mat_q(
return;
}
// __syncthreads(); // There is no previous tile that could cause a race condition.
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
......@@ -2674,6 +2676,7 @@ static __global__ void mul_mat_q(
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
}
__syncthreads();
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
......@@ -2686,7 +2689,7 @@ static __global__ void mul_mat_q(
constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
return;
}
......@@ -2717,8 +2720,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int col_high = ncols_dst;
int col_diff = ncols_dst;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
......@@ -2740,6 +2743,7 @@ static __global__ void mul_mat_q(
continue;
}
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
......@@ -2750,6 +2754,7 @@ static __global__ void mul_mat_q(
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
}
__syncthreads();
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
......@@ -2762,7 +2767,7 @@ static __global__ void mul_mat_q(
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
kbc += blocks_per_ne00;
......@@ -2787,8 +2792,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
int col_low = 0;
int col_high = ncols_y;
int col_diff = ncols_y;
int col_high = ncols_dst;
int col_diff = ncols_dst;
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
......@@ -2805,6 +2810,7 @@ static __global__ void mul_mat_q(
}
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
......@@ -2815,6 +2821,7 @@ static __global__ void mul_mat_q(
ids_dst_shared[j] = j;
}
__syncthreads();
}
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
......@@ -2827,7 +2834,7 @@ static __global__ void mul_mat_q(
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
}
......@@ -2835,7 +2842,7 @@ static __global__ void mul_mat_q(
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
static __global__ void mul_mat_q_stream_k_fixup(
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
......@@ -2844,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int bidx0 = blockIdx.x;
......@@ -2918,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
dst += offset_dst;
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_y - jt*mmq_x - 1;
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_dst - jt*mmq_x - 1;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
......@@ -2951,6 +2958,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) {
ids_dst_shared[j] = ids_dst[col_low + j];
}
__syncthreads();
const int offset_dst = it*mmq_y;
dst += offset_dst;
......@@ -2981,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
struct mmq_args {
const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst;
int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
bool use_stream_k;
......@@ -3017,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
const int ntzw = args.nchannels_y * args.nsamples_y;
const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
......@@ -3032,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
}
......@@ -3060,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
......@@ -3069,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
......@@ -3085,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
}
}
......
......@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
const size_t size_data = ggml_nbytes(src0);
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
if (size_alloc > size_data) {
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
GGML_ASSERT(!src0->view_src);
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
}
}
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
{
......
......@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
......
......@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;
......
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 1, 8);
DECL_FATTN_MMA_F16_CASE(80, 1, 8);
DECL_FATTN_MMA_F16_CASE(96, 1, 8);
DECL_FATTN_MMA_F16_CASE(112, 1, 8);
DECL_FATTN_MMA_F16_CASE(128, 1, 8);
DECL_FATTN_MMA_F16_CASE(256, 1, 8);
DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8);
DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 16, 1);
DECL_FATTN_MMA_F16_CASE(80, 16, 1);
DECL_FATTN_MMA_F16_CASE(96, 16, 1);
DECL_FATTN_MMA_F16_CASE(112, 16, 1);
DECL_FATTN_MMA_F16_CASE(128, 16, 1);
DECL_FATTN_MMA_F16_CASE(256, 16, 1);
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1);
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1);
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1);
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1);
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1);
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 16, 2);
DECL_FATTN_MMA_F16_CASE(80, 16, 2);
DECL_FATTN_MMA_F16_CASE(96, 16, 2);
DECL_FATTN_MMA_F16_CASE(112, 16, 2);
DECL_FATTN_MMA_F16_CASE(128, 16, 2);
DECL_FATTN_MMA_F16_CASE(256, 16, 2);
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2);
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2);
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 16, 4);
DECL_FATTN_MMA_F16_CASE(80, 16, 4);
DECL_FATTN_MMA_F16_CASE(96, 16, 4);
DECL_FATTN_MMA_F16_CASE(112, 16, 4);
DECL_FATTN_MMA_F16_CASE(128, 16, 4);
DECL_FATTN_MMA_F16_CASE(256, 16, 4);
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4);
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4);
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 2, 4);
DECL_FATTN_MMA_F16_CASE(80, 2, 4);
DECL_FATTN_MMA_F16_CASE(96, 2, 4);
DECL_FATTN_MMA_F16_CASE(112, 2, 4);
DECL_FATTN_MMA_F16_CASE(128, 2, 4);
DECL_FATTN_MMA_F16_CASE(256, 2, 4);
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4);
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4);
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 2, 8);
DECL_FATTN_MMA_F16_CASE(80, 2, 8);
DECL_FATTN_MMA_F16_CASE(96, 2, 8);
DECL_FATTN_MMA_F16_CASE(112, 2, 8);
DECL_FATTN_MMA_F16_CASE(128, 2, 8);
DECL_FATTN_MMA_F16_CASE(256, 2, 8);
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8);
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 32, 1);
DECL_FATTN_MMA_F16_CASE(80, 32, 1);
DECL_FATTN_MMA_F16_CASE(96, 32, 1);
DECL_FATTN_MMA_F16_CASE(112, 32, 1);
DECL_FATTN_MMA_F16_CASE(128, 32, 1);
DECL_FATTN_MMA_F16_CASE(256, 32, 1);
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1);
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1);
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1);
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1);
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1);
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 32, 2);
DECL_FATTN_MMA_F16_CASE(80, 32, 2);
DECL_FATTN_MMA_F16_CASE(96, 32, 2);
DECL_FATTN_MMA_F16_CASE(112, 32, 2);
DECL_FATTN_MMA_F16_CASE(128, 32, 2);
DECL_FATTN_MMA_F16_CASE(256, 32, 2);
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2);
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2);
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 4, 2);
DECL_FATTN_MMA_F16_CASE(80, 4, 2);
DECL_FATTN_MMA_F16_CASE(96, 4, 2);
DECL_FATTN_MMA_F16_CASE(112, 4, 2);
DECL_FATTN_MMA_F16_CASE(128, 4, 2);
DECL_FATTN_MMA_F16_CASE(256, 4, 2);
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2);
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2);
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
......@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(64, 4, 4);
DECL_FATTN_MMA_F16_CASE(80, 4, 4);
DECL_FATTN_MMA_F16_CASE(96, 4, 4);
DECL_FATTN_MMA_F16_CASE(112, 4, 4);
DECL_FATTN_MMA_F16_CASE(128, 4, 4);
DECL_FATTN_MMA_F16_CASE(256, 4, 4);
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4);
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4);
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment