Unverified Commit 8dd12c87 authored by Jeffrey Morgan, committed by GitHub

llama: update to commit e1e8e099 (#10513)

parent e6d2d041
@@ -393,8 +393,8 @@ extern "C" {
    // precision
    enum ggml_prec {
-       GGML_PREC_DEFAULT,
-       GGML_PREC_F32,
+       GGML_PREC_DEFAULT = 0,  // stored as ggml_tensor.op_params, 0 by default
+       GGML_PREC_F32     = 10,
    };

    // model file types
@@ -481,6 +481,7 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
+       GGML_OP_CONV_2D_DW,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
@@ -678,6 +679,9 @@ extern "C" {
    GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
    GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

+   // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+   GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);

    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -1661,7 +1665,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-   // depthwise
+   // depthwise (via im2col and mul_mat)
    GGML_API struct ggml_tensor * ggml_conv_2d_dw(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,  // convolution kernel
@@ -1673,6 +1677,22 @@ extern "C" {
            int                   d0,  // dilation dimension 0
            int                   d1); // dilation dimension 1

// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);

    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
...
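For readers new to the entry point added above, here is a minimal, hypothetical sketch of how ggml_conv_2d_dw_direct could be called. The tensor shapes follow the header comments (kernel KW x KH x 1 x C, input W x H x C x N); the sizes, names, and the surrounding function are illustrative assumptions, not code from this change.

    #include "ggml.h"

    // Hedged sketch: a 3x3 depthwise convolution over a 64x64 image with 8 channels,
    // stride 1, padding 1, dilation 1. All sizes are made up for illustration.
    static struct ggml_tensor * example_dw_conv(struct ggml_context * ctx) {
        struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3, 1, 8); // KW KH 1 C
        struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 8, 1); // W  H  C N
        // result: W_out x H_out x C x N, here 64x64x8x1 because of the "same" padding
        return ggml_conv_2d_dw_direct(ctx, kernel, input,
                                      /*stride0*/ 1, /*stride1*/ 1,
                                      /*pad0*/    1, /*pad1*/    1,
                                      /*dilation0*/ 1, /*dilation1*/ 1);
    }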
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
    set(GGML_CPU_TAG_NAME ${tag_name})
    # other: OPENMP LLAMAFILE CPU_HBM
    foreach (feat NATIVE
+                 SSE42
                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                  AMX_TILE AMX_INT8 AMX_BF16)
@@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
    endif()
    add_custom_target(ggml-cpu)
-   ggml_add_cpu_backend_variant(sandybridge AVX)
-   ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 BMI2 FMA)
-   ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
-   ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-   ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+   ggml_add_cpu_backend_variant(x64)
+   ggml_add_cpu_backend_variant(sse42       SSE42)
+   ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+   ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C AVX2 BMI2 FMA)
+   ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+   ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+   ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
elseif (GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
endif()
...
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            elseif (GGML_AVX)
                list(APPEND ARCH_FLAGS /arch:AVX)
                list(APPEND ARCH_DEFINITIONS GGML_AVX)
-           else ()
+           elseif (GGML_SSE42)
                list(APPEND ARCH_FLAGS /arch:SSE4.2)
                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
            endif()
@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
        else ()
-           list(APPEND ARCH_FLAGS -msse4.2)
-           list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+           if (GGML_SSE42)
+               list(APPEND ARCH_FLAGS -msse4.2)
+               list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+           endif()
            if (GGML_F16C)
                list(APPEND ARCH_FLAGS -mf16c)
                list(APPEND ARCH_DEFINITIONS GGML_F16C)
@@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        # TODO: Separation to determine activation of VX/VXE/VXE2
        if (${S390X_M} MATCHES "8561|8562")
            message(STATUS "z15 target")
-           list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
+           list(APPEND ARCH_FLAGS -march=z15)
        elseif (${S390X_M} MATCHES "3931")
            message(STATUS "z16 target")
-           list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
+           list(APPEND ARCH_FLAGS -march=z16)
+       elseif (${S390X_M} MATCHES "9175|9176")
+           # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+           message(STATUS "z17 target")
+           list(APPEND ARCH_FLAGS -march=z17)
        else()
            message(STATUS "Unknown target")
            message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
...
@@ -263,7 +263,7 @@ void test_x86_is() {
static int ggml_backend_cpu_x86_score() {
    // FIXME: this does not check for OS support
-   int score = 0;
+   int score = 1;
    cpuid_x86 is;
#ifdef GGML_FMA
...
@@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .nrows = 1,
    },
    [GGML_TYPE_F16] = {
-       .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+       .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
        .vec_dot_type = GGML_TYPE_F16,
        .nrows = 1,
@@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .from_float = quantize_row_q8_K,
    },
    [GGML_TYPE_BF16] = {
-       .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+       .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
        .vec_dot_type = GGML_TYPE_BF16,
        .nrows = 1,
@@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_im2col_back_f32(params, tensor);
            } break;
+       case GGML_OP_CONV_2D_DW:
+           {
+               ggml_compute_forward_conv_2d_dw(params, tensor);
+           } break;
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
                ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_IM2COL:
        case GGML_OP_IM2COL_BACK:
+       case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
@@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
    return ggml_graph_compute(cgraph, &cplan);
}

void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
int64_t i = 0;
#if defined(__F16C__)
#if defined(__AVX512F__)
for (; i + 15 < n; i += 16) {
__m512 x_vec = _mm512_loadu_ps(x + i);
__m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm256_storeu_si256((__m256i *)(y + i), y_vec);
}
#endif
for (; i + 7 < n; i += 8) {
__m256 x_vec = _mm256_loadu_ps(x + i);
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storeu_si128((__m128i *)(y + i), y_vec);
}
for (; i + 3 < n; i += 4) {
__m128 x_vec = _mm_loadu_ps(x + i);
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#endif
for (; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(x[i]);
}
}
void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
int64_t i = 0;
#if defined(__F16C__)
#if defined(__AVX512F__)
for (; i + 15 < n; i += 16) {
__m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
__m512 y_vec = _mm512_cvtph_ps(x_vec);
_mm512_storeu_ps(y + i, y_vec);
}
#endif
for (; i + 7 < n; i += 8) {
__m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
__m256 y_vec = _mm256_cvtph_ps(x_vec);
_mm256_storeu_ps(y + i, y_vec);
}
for (; i + 3 < n; i += 4) {
__m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
#endif
for (; i < n; ++i) {
y[i] = GGML_FP16_TO_FP32(x[i]);
}
}
void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
int64_t i = 0;
for (; i < n; ++i) {
y[i] = GGML_FP32_TO_BF16(x[i]);
}
}
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
int64_t i = 0;
#if defined(__AVX2__)
#if defined(__AVX512F__)
for (; i + 15 < n; i += 16) {
_mm512_storeu_ps(y + i,
_mm512_castsi512_ps(
_mm512_slli_epi32(
_mm512_cvtepu16_epi32(
_mm256_loadu_si256(
(const __m256i *)(x + i))),
16)));
}
#endif
for (; i + 7 < n; i += 8) {
_mm256_storeu_ps(y + i,
_mm256_castsi256_ps(
_mm256_slli_epi32(
_mm256_cvtepu16_epi32(
_mm_loadu_si128(
(const __m128i *)(x + i))),
16)));
}
#endif
for (; i < n; i++) {
y[i] = GGML_BF16_TO_FP32(x[i]);
}
}

int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
...
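The new ggml_cpu_fp32_to_fp16 / ggml_cpu_fp16_to_fp32 helpers above use F16C/AVX-512 intrinsics when available and otherwise fall back to the scalar GGML_FP32_TO_FP16 / GGML_FP16_TO_FP32 paths. As a hedged illustration only, a round-trip check might look like the sketch below; the include paths and test values are assumptions, not part of this change.

    #include <stdio.h>
    #include "ggml.h"      // for ggml_fp16_t
    #include "ggml-cpu.h"  // assumed home of the ggml_cpu_* conversion declarations

    // Hedged sketch: convert a few floats to fp16 and back, then print both values
    // to eyeball the rounding error introduced by the fp16 representation.
    int main(void) {
        float       src[5] = { -1.5f, 0.0f, 0.333333f, 100.25f, 65504.0f };
        ggml_fp16_t tmp[5];
        float       dst[5];

        ggml_cpu_fp32_to_fp16(src, tmp, 5);  // vectorized when F16C/AVX-512 is compiled in
        ggml_cpu_fp16_to_fp32(tmp, dst, 5);  // scalar tail otherwise

        for (int i = 0; i < 5; ++i) {
            printf("%f -> %f\n", src[i], dst[i]);
        }
        return 0;
    }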
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
            GGML_ASSERT(i01 >= 0 && i01 < ne01);

-           ggml_fp16_to_fp32_row(
+           ggml_cpu_fp16_to_fp32(
                    (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                         (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
        }
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
            GGML_ASSERT(i01 >= 0 && i01 < ne01);

-           ggml_bf16_to_fp32_row(
+           ggml_cpu_bf16_to_fp32(
                    (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                         (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
        }
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
    }
}

// ggml_compute_forward_conv_2d_dw
struct ggml_conv_2d_dw_params {
int64_t channels;
int64_t batch;
int64_t src_w;
int64_t src_h;
int64_t dst_w;
int64_t dst_h;
int64_t knl_w;
int64_t knl_h;
int stride_x;
int stride_y;
int pad_x;
int pad_y;
int dilation_x;
int dilation_y;
};
static void ggml_compute_forward_conv_2d_dw_cwhn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t c = p.channels;
const float * knl_data = (const float *)kernel->data;
const int64_t rows_total = p.dst_h * p.batch;
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
const int64_t row_start = params->ith * rows_per_thread;
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
#ifdef GGML_SIMD
const int64_t pkg_size = GGML_F32_EPR;
const int64_t pkg_count = c / pkg_size;
const int64_t c_pkg_end = pkg_count * pkg_size;
#else
const int64_t c_pkg_end = 0;
#endif
for (int64_t row = row_start; row < row_end; ++row) {
const int64_t dst_y = row % p.dst_h;
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
#ifdef GGML_SIMD
// Vectorized loop
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
sum = GGML_F32_VEC_FMA(sum, k, s);
}
}
GGML_F32_VEC_STORE(dst_data + c_i, sum);
}
#endif
// Scalar loop
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
* src_data[(src_y * p.src_w + src_x) * c + c_i];
}
}
dst_data[c_i] = sum;
}
}
}
}
static void ggml_compute_forward_conv_2d_dw_whcn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t n = p.channels * p.batch;
const int64_t per_thread = (n + params->nth - 1) / params->nth;
const int64_t start = params->ith * per_thread;
const int64_t end = MIN(start + per_thread, n);
for (int64_t i = start; i < end; ++i) {
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[knl_y * p.knl_w + knl_x]
* src_data[src_y * p.src_w + src_x];
}
}
dst_data[dst_y * p.dst_w + dst_x] = sum;
}
}
}
}
void ggml_compute_forward_conv_2d_dw(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * src = dst->src[1];
ggml_conv_2d_dw_params p;
p.channels = src->ne[2];
p.batch = src->ne[3];
p.src_w = src->ne[0];
p.src_h = src->ne[1];
p.dst_w = dst->ne[0];
p.dst_h = dst->ne[1];
p.knl_w = kernel->ne[0];
p.knl_h = kernel->ne[1];
p.stride_x = dst->op_params[0];
p.stride_y = dst->op_params[1];
p.pad_x = dst->op_params[2];
p.pad_y = dst->op_params[3];
p.dilation_x = dst->op_params[4];
p.dilation_y = dst->op_params[5];
GGML_ASSERT(kernel->ne[3] == p.channels);
GGML_ASSERT(dst->ne[3] == p.batch);
if (ggml_is_contiguous(src)) {
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
} else if (ggml_is_contiguous_channels(src)) {
// kernel should also have channels most contiguous in memory
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
} else {
GGML_ABORT("non-contiguous memory layout not supported");
}
}

// ggml_compute_forward_pool_1d_sk_p0

static void ggml_compute_forward_pool_1d_sk_p0(
...
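A brief note on the CPU implementation above: ggml_compute_forward_conv_2d_dw reads stride/pad/dilation from dst->op_params and dispatches to the WHCN kernel when the source is fully contiguous, or to the CWHN kernel when only the channels are contiguous (ggml_is_contiguous_channels). Both paths assume the standard relation between input and output extents; the helper below is only an illustrative restatement of that arithmetic, under assumed names, not code from this patch.

    #include <stdint.h>

    // Standard 2D convolution output size along one dimension, documenting the
    // relationship between dst_w/dst_h and the stride/pad/dilation fields of
    // ggml_conv_2d_dw_params. Names are illustrative.
    static int64_t conv_out_size(int64_t in, int64_t kernel, int stride, int pad, int dilation) {
        return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
    }

    // Example: a 64x64 input with a 3x3 kernel, stride 1, pad 1, dilation 1
    // keeps its spatial size: conv_out_size(64, 3, 1, 1, 1) == 64.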
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
...
@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F32_EPR 4

#define GGML_F32x4             vector float
-#define GGML_F32x4_ZERO        0.0f
+#define GGML_F32x4_ZERO        {0.0f}
#define GGML_F32x4_SET1        vec_splats
#define GGML_F32x4_LOAD(p)     vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
...
@@ -78,13 +78,13 @@
// Moore Threads
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)

-#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
-#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

#ifdef __CUDA_ARCH_LIST__
...
#include "convert.cuh" #include "convert.cuh"
#include "dequantize.cuh" #include "dequantize.cuh"
#include <cstdint>
#define CUDA_Q8_0_NE_ALIGN 2048 #define CUDA_Q8_0_NE_ALIGN 2048
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
}

template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(
+        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= k) {
+    if (i00 >= ne00) {
        return;
    }

+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
    const src_t * x = (const src_t *) vx;

-    y[i] = float(x[i]);
+    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
+    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
+    y[iy] = float(x[ix]);
}

template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void convert_unary_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
}

to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
-           return convert_unary_cuda<float>;
+           return convert_unary_cont_cuda<float>;
        case GGML_TYPE_F16:
-           return convert_unary_cuda<half>;
+           return convert_unary_cont_cuda<half>;
        default:
            return nullptr;
    }
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F32:
-           return convert_unary_cuda<float>;
+           return convert_unary_cont_cuda<float>;
        case GGML_TYPE_BF16:
-           return convert_unary_cuda<nv_bfloat16>;
+           return convert_unary_cont_cuda<nv_bfloat16>;
        default:
            return nullptr;
    }
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F16:
-           return convert_unary_cuda<half>;
+           return convert_unary_cont_cuda<half>;
case GGML_TYPE_BF16:
return convert_unary_cont_cuda<nv_bfloat16>;
default:
return nullptr;
}
}
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float>;
        case GGML_TYPE_BF16:
            return convert_unary_cuda<nv_bfloat16>;
        default:
...
@@ -3,7 +3,7 @@
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);

typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half>  to_fp16_cuda_t;

@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);

// TODO more general support for non-contiguous inputs
template<typename T>
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
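To make the intent of the new to_fp16_nc_cuda_t path concrete, here is a hedged host-side sketch of converting a non-contiguous F32 tensor view into a contiguous fp16 buffer. The helper name, the stride arithmetic (byte strides from tensor->nb divided by the element size), and the surrounding assumptions (device-resident data, an adequately sized destination) are illustrative, not part of the patch.

    // Hypothetical helper, assuming `src` is GGML_TYPE_F32 with device-resident data and
    // `dst` has room for ggml_nelements(src) half values. Strides are converted from
    // bytes (tensor->nb) to elements, mirroring the typedef above.
    static void convert_view_to_fp16(const ggml_tensor * src, half * dst, cudaStream_t stream) {
        const to_fp16_nc_cuda_t convert = ggml_get_to_fp16_nc_cuda(src->type);
        GGML_ASSERT(convert != nullptr);

        const size_t ts = ggml_type_size(src->type);
        convert(src->data, dst,
                src->ne[0], src->ne[1], src->ne[2], src->ne[3],
                src->nb[1] / ts, src->nb[2] / ts, src->nb[3] / ts, stream);
    }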
@@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
    }
#else
GGML_UNUSED(disable_indirection_for_this_node);
#endif
}
...
@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
    dfloat2 v;
    dequantize_kernel(src0_row, ib, iqs, v);

-   dst_row[iybs + iqs + 0]        = v.x;
-   dst_row[iybs + iqs + y_offset] = v.y;
+   dst_row[iybs + iqs + 0]        = float(v.x);
+   dst_row[iybs + iqs + y_offset] = float(v.y);
}

template<typename src0_t, typename dst_t>
@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
    const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);

-   dst_row[i00] = src0_row[i00];
+   dst_row[i00] = float(src0_row[i00]);
}

template<typename grad_t, typename dst_t>
@@ -86,122 +86,161 @@ static __global__ void k_get_rows_back_float(
    dst[dst_row*ncols + col] = sum;
}

-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(
-        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-    GGML_TENSOR_BINARY_OP_LOCALS
+template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
+static void get_rows_cuda_q(
+        const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
-   //const size_t s0 = nb0 / ggml_element_size(dst);
-   const size_t s1 = nb1 / ggml_element_size(dst);
-   const size_t s2 = nb2 / ggml_element_size(dst);
-   const size_t s3 = nb3 / ggml_element_size(dst);
-   const size_t s10 = nb10 / ggml_element_size(src1);
-   const size_t s11 = nb11 / ggml_element_size(src1);
-   const size_t s12 = nb12 / ggml_element_size(src1);
-   //const size_t s13 = nb13 / ggml_element_size(src1);
+   // const size_t s0 = nb0 / sizeof(dst_t);
+   const size_t s1 = nb1 / sizeof(dst_t);
+   const size_t s2 = nb2 / sizeof(dst_t);
+   const size_t s3 = nb3 / sizeof(dst_t);
+   const size_t s10 = nb10 / sizeof(int32_t);
+   const size_t s11 = nb11 / sizeof(int32_t);
+   const size_t s12 = nb12 / sizeof(int32_t);
+   // const size_t s13 = nb13 / sizeof(int32_t);

    GGML_ASSERT(ne00 % 2 == 0);

    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-       src0_dd, src1_dd, dst_dd,
+       src0_d, src1_d, dst_d,
        ne00, /*ne01, ne02, ne03,*/
        /*ne10, ne11,*/ ne12, /*ne13,*/
        /* s0,*/ s1, s2, s3,
        /* nb00,*/ nb01, nb02, nb03,
        s10, s11, s12/*, s13*/);
-
-   GGML_UNUSED(dst);
}
-template<typename src0_t>
+template<typename src0_t, typename dst_t>
static void get_rows_cuda_float(
-        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-    GGML_TENSOR_BINARY_OP_LOCALS
-    GGML_ASSERT(ne13 == 1);
+        const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {

    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

    // strides in elements
-   //const size_t s0 = nb0 / ggml_element_size(dst);
-   const size_t s1 = nb1 / ggml_element_size(dst);
-   const size_t s2 = nb2 / ggml_element_size(dst);
-   const size_t s3 = nb3 / ggml_element_size(dst);
-   const size_t s10 = nb10 / ggml_element_size(src1);
-   const size_t s11 = nb11 / ggml_element_size(src1);
-   const size_t s12 = nb12 / ggml_element_size(src1);
-   //const size_t s13 = nb13 / ggml_element_size(src1);
+   // const size_t s0 = nb0 / sizeof(dst_t);
+   const size_t s1 = nb1 / sizeof(dst_t);
+   const size_t s2 = nb2 / sizeof(dst_t);
+   const size_t s3 = nb3 / sizeof(dst_t);
+   const size_t s10 = nb10 / sizeof(int32_t);
+   const size_t s11 = nb11 / sizeof(int32_t);
+   const size_t s12 = nb12 / sizeof(int32_t);
+   // const size_t s13 = nb13 / sizeof(int32_t);

    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-       src0_dd, src1_dd, dst_dd,
+       src0_d, src1_d, dst_d,
        ne00, /*ne01, ne02, ne03,*/
        /*ne10, ne11,*/ ne12, /*ne13,*/
        /* s0,*/ s1, s2, s3,
        /* nb00,*/ nb01, nb02, nb03,
        s10, s11, s12/*, s13*/);
-
-   GGML_UNUSED(dst);
}
-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const void * src0_d = (const void *) src0->data;
-    const int32_t * src1_d = (const int32_t *) src1->data;
-    float * dst_d = (float *) dst->data;
-    cudaStream_t stream = ctx.stream();
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-    switch (src0->type) {
+template <typename dst_t>
+static void ggml_cuda_get_rows_switch_src0_type(
+        const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
+    switch (src0_type) {
        case GGML_TYPE_F16:
-           get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
+           get_rows_cuda_float((const half *) src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
            break;
        case GGML_TYPE_F32:
-           get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
+           get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+           break;
+       case GGML_TYPE_BF16:
+           get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
            break;
        case GGML_TYPE_Q4_0:
-           get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+           get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
            break;
        case GGML_TYPE_Q4_1:
-           get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+           get_rows_cuda_q<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
            break;
        case GGML_TYPE_Q5_0:
-           get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+           get_rows_cuda_q<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
            break;
        case GGML_TYPE_Q5_1:
-           get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+           get_rows_cuda_q<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
            break;
        case GGML_TYPE_Q8_0:
-           get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+           get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
+               ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
            break;
        default:
            // TODO: k-quants
-           GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+           GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
            break;
    }
}
void get_rows_cuda(
const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
size_t nb1, size_t nb2, size_t nb3,
cudaStream_t stream) {
switch (dst_type) {
case GGML_TYPE_F32:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_F16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_BF16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
default:
GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type));
break;
}
}
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
cudaStream_t stream = ctx.stream();
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
}

void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
    const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass
...
@@ -3,6 +3,13 @@
#define CUDA_GET_ROWS_BLOCK_SIZE      256
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256

void get_rows_cuda(
const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
size_t nb1, size_t nb2, size_t nb3,
cudaStream_t stream);

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
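The exported get_rows_cuda above works on raw device pointers plus extents and byte strides, so it can now be driven without a ggml_tensor. The sketch below is a hedged illustration of a direct call that gathers rows of a contiguous F16 matrix into an F32 buffer; the helper name, the stride values, and the contiguous-layout assumption are mine, not part of the patch.

    // Hypothetical direct use: gather n_rows rows of length ne00 from a contiguous F16
    // matrix src0_d into dst_d, using the row indices in src1_d. Strides are byte
    // strides for fully contiguous buffers, mirroring what ggml_cuda_op_get_rows
    // forwards from the tensor nb[] fields.
    static void gather_rows_f16_to_f32(const half * src0_d, const int32_t * src1_d, float * dst_d,
                                       int64_t ne00, int64_t n_rows, cudaStream_t stream) {
        const size_t nb01 = ne00 * sizeof(half);      // stride between src0 rows
        const size_t nb02 = nb01;                     // dims 2/3 collapse for a single matrix
        const size_t nb03 = nb01;
        const size_t nb10 = sizeof(int32_t);          // src1 is a flat index vector
        const size_t nb11 = n_rows * sizeof(int32_t);
        const size_t nb12 = nb11;
        const size_t nb1  = ne00 * sizeof(float);     // dst strides
        const size_t nb2  = nb1 * n_rows;
        const size_t nb3  = nb2;

        get_rows_cuda(src0_d, GGML_TYPE_F16, src1_d, dst_d, GGML_TYPE_F32,
                      ne00, nb01, nb02, nb03,
                      /*ne10*/ n_rows, /*ne11*/ 1, /*ne12*/ 1,
                      nb10, nb11, nb12, nb1, nb2, nb3, stream);
    }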
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
@@ -3,7 +3,7 @@
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512

-void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

void ggml_cuda_op_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
...
This diff is collapsed.