OpenDAS / ollama · Commits · 8dd12c87

Commit 8dd12c87 (unverified), authored May 01, 2025 by Jeffrey Morgan, committed by GitHub on May 01, 2025.

llama: update to commit e1e8e099 (#10513)

Parent: e6d2d041
Changes: 68 · Showing 20 changed files with 1738 additions and 862 deletions (+1738 −862) on this page.
Changed files shown on this page:

    ml/backend/ggml/ggml/include/ggml.h                    +23   −3
    ml/backend/ggml/ggml/src/CMakeLists.txt                +8    −5
    ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt       +11   −5
    ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp    +1    −1
    ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c           +94   −2
    ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp              +174  −2
    ml/backend/ggml/ggml/src/ggml-cpu/ops.h                +1    −0
    ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h      +1    −1
    ml/backend/ggml/ggml/src/ggml-cuda/common.cuh          +4    −4
    ml/backend/ggml/ggml/src/ggml-cuda/convert.cu          +41   −12
    ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh         +11   −1
    ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu              +2    −0
    ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu          +105  −66
    ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh         +7    −0
    ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu        +173  −198
    ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu              +189  −31
    ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh             +442  −192
    ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu              +110  −71
    ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh             +1    −1
    ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu             +340  −267
ml/backend/ggml/ggml/include/ggml.h  (view file @ 8dd12c87)

@@ -393,8 +393,8 @@ extern "C" {
     // precision
     enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
+        GGML_PREC_DEFAULT =  0, // stored as ggml_tensor.op_params, 0 by default
+        GGML_PREC_F32     = 10,
     };

     // model file types

@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,

@@ -678,6 +679,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -1661,7 +1665,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // convolution kernel

@@ -1673,6 +1677,22 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1

+    // Depthwise 2D convolution
+    // may be faster than ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1,
+            int                   dilation0,
+            int                   dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
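Note (not part of the diff): a minimal sketch of how the new ggml_conv_2d_dw_direct entry point can be wired into a graph, based only on the signature and shape comments above. The tensor sizes (a 3x3 kernel over a 16x16, 8-channel input) and the unit stride/pad/dilation values are illustrative assumptions.

#include "ggml.h"

// Sketch only: shapes are hypothetical, chosen to match the header comment
//   a: KW KH 1 C (kernel)    b: W H C N (input)    res: W_out H_out C N
static struct ggml_tensor * build_dw_conv(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3, 1, 8); // 3x3 kernel, 8 channels
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 8, 1); // 16x16 input, 8 channels, batch 1
    // stride 1, padding 1, dilation 1 -> output stays 16x16x8x1
    return ggml_conv_2d_dw_direct(ctx, a, b, /*stride*/ 1, 1, /*pad*/ 1, 1, /*dilation*/ 1, 1);
}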
ml/backend/ggml/ggml/src/CMakeLists.txt  (view file @ 8dd12c87)

@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
     foreach (feat NATIVE
+                  SSE42
                   AVX AVX2 BMI2 AVX_VNNI FMA F16C
                   AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                   AMX_TILE AMX_INT8 AMX_BF16)

@@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
     add_custom_target(ggml-cpu)
-    ggml_add_cpu_backend_variant(sandybridge AVX)
-    ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 BMI2 FMA)
-    ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+    ggml_add_cpu_backend_variant(x64)
+    ggml_add_cpu_backend_variant(sse42       SSE42)
+    ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+    ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C AVX2 BMI2 FMA)
+    ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
 endif()
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt  (view file @ 8dd12c87)

@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             elseif (GGML_AVX)
                 list(APPEND ARCH_FLAGS /arch:AVX)
                 list(APPEND ARCH_DEFINITIONS GGML_AVX)
-            else ()
+            elseif (GGML_SSE42)
                 list(APPEND ARCH_FLAGS /arch:SSE4.2)
                 list(APPEND ARCH_DEFINITIONS GGML_SSE42)
             endif()

@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         if (GGML_NATIVE)
             list(APPEND ARCH_FLAGS -march=native)
         else ()
-            list(APPEND ARCH_FLAGS -msse4.2)
-            list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            if (GGML_SSE42)
+                list(APPEND ARCH_FLAGS -msse4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            endif()
             if (GGML_F16C)
                 list(APPEND ARCH_FLAGS -mf16c)
                 list(APPEND ARCH_DEFINITIONS GGML_F16C)

@@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
             message(STATUS "z15 target")
-            list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
+            list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
             message(STATUS "z16 target")
-            list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
+            list(APPEND ARCH_FLAGS -march=z16)
+        elseif (${S390X_M} MATCHES "9175|9176")
+            # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+            message(STATUS "z17 target")
+            list(APPEND ARCH_FLAGS -march=z17)
         else ()
-            message(STATUS "Unknown target")
+            message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp  (view file @ 8dd12c87)

@@ -263,7 +263,7 @@ void test_x86_is() {

 static int ggml_backend_cpu_x86_score() {
     // FIXME: this does not check for OS support
-    int score = 0;
+    int score = 1;
     cpuid_x86 is;

 #ifdef GGML_FMA
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c  (view file @ 8dd12c87)

@@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows        = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float   = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float   = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows        = 1,

@@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float   = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float   = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float   = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows        = 1,

@@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D_DW:
+            {
+                ggml_compute_forward_conv_2d_dw(params, tensor);
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);

@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
             {

@@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }

+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
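Note (not part of the diff): the four ggml_cpu_* converters added above are plain buffer-to-buffer helpers, so they can be exercised directly. A minimal hedged sketch; the ggml-cpu.h header as the location of the declarations and the sample values are assumptions.

#include <stdio.h>
#include "ggml.h"
#include "ggml-cpu.h" // assumed location of the ggml_cpu_* conversion declarations

int main(void) {
    float src[4] = { 0.5f, -1.25f, 3.0f, 65504.0f };
    ggml_fp16_t tmp[4];
    float dst[4];

    ggml_cpu_fp32_to_fp16(src, tmp, 4); // vectorized via F16C/AVX512 when available
    ggml_cpu_fp16_to_fp32(tmp, dst, 4); // round-trip back to fp32

    for (int i = 0; i < 4; ++i) {
        printf("%g -> %g\n", src[i], dst[i]);
    }
    return 0;
}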
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp  (view file @ 8dd12c87)

@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
             GGML_ASSERT(i01 >= 0 && i01 < ne01);

-            ggml_fp16_to_fp32_row(
+            ggml_cpu_fp16_to_fp32(
                     (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                     (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }

@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
             GGML_ASSERT(i01 >= 0 && i01 < ne01);

-            ggml_bf16_to_fp32_row(
+            ggml_cpu_bf16_to_fp32(
                     (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                     (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }

@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
     }
 }

 // ggml_compute_forward_conv_2d_dw
struct ggml_conv_2d_dw_params {
int64_t channels;
int64_t batch;
int64_t src_w;
int64_t src_h;
int64_t dst_w;
int64_t dst_h;
int64_t knl_w;
int64_t knl_h;
int stride_x;
int stride_y;
int pad_x;
int pad_y;
int dilation_x;
int dilation_y;
};
static void ggml_compute_forward_conv_2d_dw_cwhn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t c = p.channels;
const float * knl_data = (const float *)kernel->data;
const int64_t rows_total = p.dst_h * p.batch;
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
const int64_t row_start = params->ith * rows_per_thread;
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
#ifdef GGML_SIMD
const int64_t pkg_size = GGML_F32_EPR;
const int64_t pkg_count = c / pkg_size;
const int64_t c_pkg_end = pkg_count * pkg_size;
#else
const int64_t c_pkg_end = 0;
#endif
for (int64_t row = row_start; row < row_end; ++row) {
const int64_t dst_y = row % p.dst_h;
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
#ifdef GGML_SIMD
// Vectorized loop
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
sum = GGML_F32_VEC_FMA(sum, k, s);
}
}
GGML_F32_VEC_STORE(dst_data + c_i, sum);
}
#endif
// Scalar loop
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
* src_data[(src_y * p.src_w + src_x) * c + c_i];
}
}
dst_data[c_i] = sum;
}
}
}
}
static void ggml_compute_forward_conv_2d_dw_whcn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t n = p.channels * p.batch;
const int64_t per_thread = (n + params->nth - 1) / params->nth;
const int64_t start = params->ith * per_thread;
const int64_t end = MIN(start + per_thread, n);
for (int64_t i = start; i < end; ++i) {
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[knl_y * p.knl_w + knl_x]
* src_data[src_y * p.src_w + src_x];
}
}
dst_data[dst_y * p.dst_w + dst_x] = sum;
}
}
}
}
void ggml_compute_forward_conv_2d_dw(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * src = dst->src[1];
ggml_conv_2d_dw_params p;
p.channels = src->ne[2];
p.batch = src->ne[3];
p.src_w = src->ne[0];
p.src_h = src->ne[1];
p.dst_w = dst->ne[0];
p.dst_h = dst->ne[1];
p.knl_w = kernel->ne[0];
p.knl_h = kernel->ne[1];
p.stride_x = dst->op_params[0];
p.stride_y = dst->op_params[1];
p.pad_x = dst->op_params[2];
p.pad_y = dst->op_params[3];
p.dilation_x = dst->op_params[4];
p.dilation_y = dst->op_params[5];
GGML_ASSERT(kernel->ne[3] == p.channels);
GGML_ASSERT(dst->ne[3] == p.batch);
if (ggml_is_contiguous(src)) {
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
} else if (ggml_is_contiguous_channels(src)) {
// kernel should also have channels most contiguous in memory
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
} else {
GGML_ABORT("non-contiguous memory layout not supported");
}
}
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_pool_1d_sk_p0(
...
...
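Note (not part of the diff): the depthwise forward pass above only consumes stride/pad/dilation from dst->op_params and trusts dst->ne for the output extent. For reference, the output size per spatial dimension follows the usual dilated-convolution formula; the helper below is a hypothetical illustration under that assumption, not code from this commit.

#include <stdint.h>

// Hypothetical helper: expected output extent for one spatial dimension,
// assuming the standard dilated-convolution size formula used by ggml's conv ops.
static int64_t conv_2d_dw_out_size(int64_t in, int64_t knl, int stride, int pad, int dilation) {
    return (in + 2*pad - dilation*(knl - 1) - 1) / stride + 1;
}

// Example: a 16-wide input, 3-wide kernel, stride 1, pad 1, dilation 1 -> 16.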
ml/backend/ggml/ggml/src/ggml-cpu/ops.h  (view file @ 8dd12c87)

@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h  (view file @ 8dd12c87)

@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32_EPR 4

 #define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_ZERO         {0.0f}
 #define GGML_F32x4_SET1         vec_splats
 #define GGML_F32x4_LOAD(p)      vec_xl(0, p)
 #define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh  (view file @ 8dd12c87)

@@ -78,13 +78,13 @@
 // Moore Threads
 #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)

-#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
-#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

 #ifdef __CUDA_ARCH_LIST__
ml/backend/ggml/ggml/src/ggml-cuda/convert.cu  (view file @ 8dd12c87)

 #include "convert.cuh"
 #include "dequantize.cuh"

 #include <cstdint>

 #define CUDA_Q8_0_NE_ALIGN 2048

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>

@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }

 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(
+        const void * __restrict__ vx, dst_t * __restrict__ y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= k) {
+    if (i00 >= ne00) {
         return;
     }

+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
     const src_t * x = (const src_t *) vx;

-    y[i] = float(x[i]);
+    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
+    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
+    y[iy] = float(x[ix]);
 }

 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void convert_unary_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
 }

 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
         default:
             return nullptr;
     }

@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
-            return convert_unary_cuda<nv_bfloat16>;
+            return convert_unary_cont_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }

@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cont_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
 }
+
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_unary_cuda<float>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cuda<nv_bfloat16>;
+        default:
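Note (not part of the diff): the new indexing in convert_unary maps a logical element (i00, i01, i02, i03) to a strided source offset and a dense destination offset. A host-side restatement of that arithmetic, for reading purposes only; the function names are illustrative and mirror the kernel above.

#include <cstdint>

// Source offset uses the element strides s01/s02/s03 (rows may be padded or permuted);
// destination offset assumes a fully contiguous ne00*ne01*ne02*ne03 buffer.
static int64_t src_index(int64_t i00, int64_t i01, int64_t i02, int64_t i03,
                         int64_t s01, int64_t s02, int64_t s03) {
    return i03*s03 + i02*s02 + i01*s01 + i00;
}

static int64_t dst_index(int64_t i00, int64_t i01, int64_t i02, int64_t i03,
                         int64_t ne00, int64_t ne01, int64_t ne02) {
    return ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
}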
ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh  (view file @ 8dd12c87)

@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);

 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half>  to_fp16_cuda_t;

@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);

 to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
+
+// TODO more general support for non-contiguous inputs
+
+template<typename T>
+using to_t_nc_cuda_t = void (*)(const void * x, T * y,
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
+    int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
+
+typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
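Note (not part of the diff): a hedged sketch of how the new non-contiguous converter type would be obtained and invoked from host code, derived from the typedef above; the helper name and the use of ggml_type_size to turn byte strides (nb[]) into element strides are illustrative assumptions.

#include "convert.cuh"
#include "ggml.h"

// Hypothetical helper: convert a possibly non-contiguous F32/BF16 tensor into a dense FP16 buffer.
static void convert_tensor_to_fp16_nc(const ggml_tensor * src, half * dst, cudaStream_t stream) {
    const to_fp16_nc_cuda_t convert = ggml_get_to_fp16_nc_cuda(src->type);
    GGML_ASSERT(convert != nullptr);

    // element strides, derived from the byte strides nb[]
    const int64_t s01 = src->nb[1] / ggml_type_size(src->type);
    const int64_t s02 = src->nb[2] / ggml_type_size(src->type);
    const int64_t s03 = src->nb[3] / ggml_type_size(src->type);

    convert(src->data, dst, src->ne[0], src->ne[1], src->ne[2], src->ne[3], s01, s02, s03, stream);
}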
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu  (view file @ 8dd12c87)

@@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     if (ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
+#else
+    GGML_UNUSED(disable_indirection_for_this_node);
 #endif
 }
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu  (view file @ 8dd12c87)

@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
     dfloat2 v;
     dequantize_kernel(src0_row, ib, iqs, v);

-    dst_row[iybs + iqs + 0]        = v.x;
-    dst_row[iybs + iqs + y_offset] = v.y;
+    dst_row[iybs + iqs + 0]        = float(v.x);
+    dst_row[iybs + iqs + y_offset] = float(v.y);
 }

 template<typename src0_t, typename dst_t>

@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);

-    dst_row[i00] = src0_row[i00];
+    dst_row[i00] = float(src0_row[i00]);
 }

 template<typename grad_t, typename dst_t>

@@ -86,122 +86,161 @@ static __global__ void k_get_rows_back_float(
     dst[dst_row*ncols + col] = sum;
 }

-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(
-        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-    GGML_TENSOR_BINARY_OP_LOCALS
+template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
+static void get_rows_cuda_q(
+        const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {

     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
     const dim3 block_nums(block_num_x, ne10, ne11*ne12);

     // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
+    //const size_t s0 = nb0 / sizeof(dst_t);
+    const size_t s1 = nb1 / sizeof(dst_t);
+    const size_t s2 = nb2 / sizeof(dst_t);
+    const size_t s3 = nb3 / sizeof(dst_t);

-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
+    const size_t s10 = nb10 / sizeof(int32_t);
+    const size_t s11 = nb11 / sizeof(int32_t);
+    const size_t s12 = nb12 / sizeof(int32_t);
+    //const size_t s13 = nb13 / sizeof(int32_t);

     GGML_ASSERT(ne00 % 2 == 0);

     k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-        src0_dd, src1_dd, dst_dd,
+        src0_d, src1_d, dst_d,
         ne00, /*ne01, ne02, ne03,*/
         /*ne10, ne11,*/ ne12, /*ne13,*/
         /* s0,*/ s1, s2, s3,
         /* nb00,*/ nb01, nb02, nb03,
         s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
 }

-template<typename src0_t>
+template<typename src0_t, typename dst_t>
 static void get_rows_cuda_float(
-        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(ne13 == 1);
+        const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {

     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
     const dim3 block_nums(block_num_x, ne10, ne11*ne12);

     // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
+    //const size_t s0 = nb0 / sizeof(dst_t);
+    const size_t s1 = nb1 / sizeof(dst_t);
+    const size_t s2 = nb2 / sizeof(dst_t);
+    const size_t s3 = nb3 / sizeof(dst_t);

-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
+    const size_t s10 = nb10 / sizeof(int32_t);
+    const size_t s11 = nb11 / sizeof(int32_t);
+    const size_t s12 = nb12 / sizeof(int32_t);
+    //const size_t s13 = nb13 / sizeof(int32_t);

     k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-        src0_dd, src1_dd, dst_dd,
+        src0_d, src1_d, dst_d,
         ne00, /*ne01, ne02, ne03,*/
         /*ne10, ne11,*/ ne12, /*ne13,*/
         /* s0,*/ s1, s2, s3,
         /* nb00,*/ nb01, nb02, nb03,
         s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
 }

-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    const void    * src0_d = (const void    *) src0->data;
-    const int32_t * src1_d = (const int32_t *) src1->data;
-    float         * dst_d  = (float         *) dst->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0]  == ggml_type_size(dst->type));
-
-    switch (src0->type) {
+template<typename dst_t>
+static void ggml_cuda_get_rows_switch_src0_type(
+        const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
+    switch (src0_type) {
         case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_float((const half *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_BF16:
+            get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         default:
             // TODO: k-quants
-            GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
             break;
     }
 }

+void get_rows_cuda(
+        const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
+        int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
+        int64_t ne10, int64_t ne11, int64_t ne12,
+        size_t nb10, size_t nb11, size_t nb12,
+        size_t nb1, size_t nb2, size_t nb3,
+        cudaStream_t stream) {
+    switch (dst_type) {
+        case GGML_TYPE_F32:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_F16:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_BF16:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        default:
+            GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type));
+            break;
+    }
+}
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ne13 == 1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0]  == ggml_type_size(dst->type));
+
+    get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type,
+        ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+}
+
 void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
     const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh  (view file @ 8dd12c87)

@@ -3,6 +3,13 @@
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256

+void get_rows_cuda(
+        const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
+        int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
+        int64_t ne10, int64_t ne11, int64_t ne12,
+        size_t nb10, size_t nb11, size_t nb12,
+        size_t nb1, size_t nb2, size_t nb3, cudaStream_t stream);
+
 void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu  (view file @ 8dd12c87)
    (diff collapsed on this page)

ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu  (view file @ 8dd12c87)
    (diff collapsed on this page)

ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh  (view file @ 8dd12c87)
    (diff collapsed on this page)

ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu  (view file @ 8dd12c87)
    (diff collapsed on this page)
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh  (view file @ 8dd12c87)

@@ -3,7 +3,7 @@
 // maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
 #define MMV_MAX_ROWS 512

-void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

 void ggml_cuda_op_mul_mat_vec(
     ggml_backend_cuda_context & ctx,
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu  (view file @ 8dd12c87)
    (diff collapsed on this page)