Commit 386c53bb authored by xuxzh1

update

parent 4667452a
@@ -24,12 +24,12 @@
* SOFTWARE.
*/
#include "mmvq.cuh"
#include "vecdotq.cuh"
#include "mmvq.cuh"
#include "vecdotq.cuh"
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
@@ -50,9 +50,9 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
nullptr;
}
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
@@ -71,30 +71,29 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
1;
}
template <ggml_type type, int ncols_y>
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*16, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
// #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
//__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
// #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
constexpr int nwarps = 1;
constexpr int rows_per_cuda_block = 1;
#else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 1;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x;
@@ -102,7 +101,7 @@ static __global__ void mul_mat_vec_q(
const int blocks_per_col_y = nrows_y / QK8_1;
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
// partial sum for each thread
float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
const block_q8_1 * y = (const block_q8_1 *) vy;
@@ -112,24 +111,22 @@ static __global__ void mul_mat_vec_q(
// x block quant index when casting the quants to int
const int kqs = vdr * (tid % (qi/vdr));
#pragma unroll ncols_y
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll rows_per_cuda_block
for (int i = 0; i < rows_per_cuda_block; ++i) {
//tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
atomicAdd(&tmp[j][i], vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs));
tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
}
}
}
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
if (threadIdx.y > 0) {
#pragma unroll ncols_y
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll rows_per_cuda_block
for (int i = 0; i < rows_per_cuda_block; ++i) {
//tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
atomicExch(&tmp_shared[threadIdx.y-1][j][i][threadIdx.x], tmp[j][i]);
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
}
}
}
@@ -139,28 +136,25 @@ static __global__ void mul_mat_vec_q(
}
// sum up partial sums and write back result
#pragma unroll ncols_y
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll rows_per_cuda_block
for (int i = 0; i < rows_per_cuda_block; ++i) {
#pragma unroll
for (int l = 0; l < nwarps-1; ++l) {
//tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
atomicAdd(&tmp[j][i], tmp_shared[l][j][i][threadIdx.x]);
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
}
//tmp[j][i] = warp_reduce_sum(tmp[j][i]);
atomicExch(&tmp[j][i], warp_reduce_sum(tmp[j][i]));
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
}
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
//dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
atomicExch(&dst[j*nrows_dst + row0 + threadIdx.x], tmp[j][threadIdx.x]);
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
}
}
}
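The updated lines above replace atomicAdd/atomicExch on tmp with plain adds and assignments: the per-thread accumulator tmp[ncols_y][rows_per_cuda_block] is private to its owning thread, so no atomics are needed, and the cross-thread combination happens afterwards through tmp_shared and warp_reduce_sum. A minimal sketch of the butterfly reduction that warp_reduce_sum is assumed to perform (the helper name is illustrative; WARP_SIZE comes from the surrounding ggml CUDA headers):
static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
        // each lane adds its XOR partner's value; after log2(WARP_SIZE) steps
        // every lane of the warp holds the full sum
        x += __shfl_xor_sync(0xFFFFFFFF, x, offset, WARP_SIZE);
    }
    return x;
}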
template <ggml_type type>
static void mul_mat_vec_q_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
@@ -182,7 +176,7 @@ static void mul_mat_vec_q_cuda(
case 3:
case 4:
nwarps = 4;
rows_per_cuda_block = 2;
rows_per_cuda_block = 1;
break;
case 5:
case 6:
@@ -229,142 +223,142 @@ static void mul_mat_vec_q_cuda(
GGML_ABORT("fatal error");
break;
}
}
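The launch configuration that consumes nwarps and rows_per_cuda_block is elided by the hunk above; a hedged reconstruction of its usual shape (variable names are assumptions following the surrounding code):
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
const dim3 block_nums(nblocks, 1, 1);
const dim3 block_dims(WARP_SIZE, nwarps, 1);
// e.g. mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
With rows_per_cuda_block forced to 1 in this commit, the grid grows to one block per output row. Relatedly, the __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) hint commented out near the kernel would have promised at most nwarps*WARP_SIZE threads per block with at least one resident block per SM; without it, register allocation is left to the compiler's defaults.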
static void mul_mat_vec_q4_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q4_1_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q5_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q5_1_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q8_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q2_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q3_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q4_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q5_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q6_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq2_xxs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq2_xs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq2_s_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq3_xxs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq1_s_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq1_m_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq4_nl_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq4_xs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq3_s_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -452,4 +446,5 @@ void ggml_cuda_op_mul_mat_vec_q(
GGML_UNUSED(src1_ddf_i);
GGML_UNUSED(src1_ncols);
GGML_UNUSED(src1_padded_row_size);
}
\ No newline at end of file
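The body of ggml_cuda_op_mul_mat_vec_q is elided above; it is essentially a switch on src0->type that forwards to the per-type wrappers defined earlier. A hedged sketch of that dispatch, with argument names assumed rather than taken from this diff:
switch (src0->type) {
    case GGML_TYPE_Q4_0:
        mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i,
            ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
        break;
    // ... one case per supported quantization type ...
    default:
        GGML_ABORT("fatal error");
        break;
}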
@@ -24,10 +24,10 @@
* SOFTWARE.
*/
#include "quantize.cuh"
#include <cstdint>
#include "quantize.cuh"
#include <cstdint>
static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) {
@@ -58,14 +58,14 @@ static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __res
if (iqs > 0) {
return;
}
ggml_half2 ds = {d, sum};
y[ib].ds = ds;
y[ib].ds = ggml_half2 {d, sum};
//reinterpret_cast<half&>(y[ib].ds) = ds;
//reinterpret_cast<half&>(y[ib].ds.y) = sum;
}
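The new assignment above packs the block scale d and the row sum into y[ib].ds with a single store. Assuming ggml_half2 aliases CUDA's half2 in this build, the same packing can be written with an fp16 intrinsic (hypothetical helper, not part of this commit; needs <cuda_fp16.h>, which the ggml CUDA headers normally pull in):
static __device__ __forceinline__ half2 pack_ds_sketch(const float d, const float sum) {
    return __floats2half2_rn(d, sum);  // .x = d, .y = sum, round-to-nearest-even
}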
template <mmq_q8_1_ds_layout ds_layout>
static __global__ void quantize_mmq_q8_1(
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
@@ -95,7 +95,7 @@ static __global__ void quantize_mmq_q8_1(
amax = fmaxf(amax, fabsf(xi.w));
// Exchange max. abs. value between vals_per_scale/4 threads.
#pragma unroll
for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
}
@@ -105,7 +105,7 @@ static __global__ void quantize_mmq_q8_1(
sum = xi.x + xi.y + xi.z + xi.w;
// Exchange the calculated sum across vals_per_sum/4 threads.
#pragma unroll
for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
}
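After the shuffles above, the lanes of each reduction group share the block's amax and sum; the kernel then derives the scale and quantizes each value. A self-contained sketch of that per-value step (names are illustrative, not code from this hunk):
static __device__ __forceinline__ int8_t quantize_val_sketch(const float xi, const float amax) {
    const float d = amax / 127;                           // per-block scale
    return amax == 0.0f ? 0 : (int8_t) roundf(xi / d);    // symmetric int8 quantization
}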
@@ -151,9 +151,9 @@ static __global__ void quantize_mmq_q8_1(
} else {
y[ib].d4[iqs/32] = d;
}
}
void quantize_row_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
@@ -165,9 +165,9 @@ void quantize_row_q8_1_cuda(
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
GGML_UNUSED(type_x);
}
void quantize_mmq_q8_1_cuda(
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
@@ -193,4 +193,5 @@ void quantize_mmq_q8_1_cuda(
GGML_ABORT("fatal error");
break;
}
}
\ No newline at end of file
@@ -505,7 +505,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
float sumf = 0.0f;
#pragma unroll
#pragma unroll QR6_K
for (int i = 0; i < QR6_K; ++i) {
const int sc = scales[4*i];
@@ -803,7 +803,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
int u[QR6_K];
float d8[QR6_K];
#pragma unroll
#pragma unroll QR6_K
for (int i = 0; i < QR6_K; ++i) {
u[i] = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
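Both hunks above change a bare #pragma unroll into #pragma unroll QR6_K, giving the compiler an explicit unroll count instead of leaving the factor to its heuristics. A generic illustration with a literal count (not code from this commit):
__device__ float unroll_example(const float * vals) {
    float sumf = 0.0f;
#pragma unroll 2
    for (int i = 0; i < 2; ++i) {
        sumf += vals[i];  // the requested factor duplicates the loop body two times
    }
    return sumf;
}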