OpenDAS / ollama · Commits

Commit 8dd12c87 (Unverified)
llama: update to commit e1e8e099 (#10513)
Authored May 01, 2025 by Jeffrey Morgan; committed by GitHub on May 01, 2025
parent e6d2d041

Changes: 68 — showing 20 changed files on this page, with 1738 additions and 862 deletions (+1738, −862)
ml/backend/ggml/ggml/include/ggml.h                    +23    -3
ml/backend/ggml/ggml/src/CMakeLists.txt                 +8    -5
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt       +11    -5
ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp     +1    -1
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c           +94    -2
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp             +174    -2
ml/backend/ggml/ggml/src/ggml-cpu/ops.h                 +1    -0
ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h       +1    -1
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh           +4    -4
ml/backend/ggml/ggml/src/ggml-cuda/convert.cu          +41   -12
ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh         +11    -1
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu               +2    -0
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu         +105   -66
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh          +7    -0
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu       +173  -198
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu             +189   -31
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh            +442  -192
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu             +110   -71
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh              +1    -1
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu            +340  -267
ml/backend/ggml/ggml/include/ggml.h

@@ -393,8 +393,8 @@ extern "C" {
     // precision
     enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
+        GGML_PREC_DEFAULT =  0, // stored as ggml_tensor.op_params, 0 by default
+        GGML_PREC_F32     = 10,
     };

     // model file types

@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,

@@ -678,6 +679,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -1661,7 +1665,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // convolution kernel

@@ -1673,6 +1677,22 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1

+    // Depthwise 2D convolution
+    // may be faster than ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1,
+            int                   dilation0,
+            int                   dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
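Note (not part of the commit): the header comment above documents the expected tensor shapes for the new ggml_conv_2d_dw_direct entry point, but no usage appears in this diff. The C sketch below shows how it might be called through the public ggml API; the tensor sizes, 16 MB context, and thread count are illustrative assumptions.

#include "ggml.h"
#include <stdlib.h>

int main(void) {
    // Hypothetical sizes: 3x3 depthwise kernel over an 8-channel 32x32 input, batch 1.
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3, 1, 8); // KW x KH x 1 x C
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 8, 1); // W  x H  x C x N

    // stride 1x1, padding 1x1, dilation 1x1 -> output stays 32 x 32 x 8 x 1
    struct ggml_tensor * res = ggml_conv_2d_dw_direct(ctx, a, b, 1, 1, 1, 1, 1, 1);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, res);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);

    ggml_free(ctx);
    return 0;
}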
ml/backend/ggml/ggml/src/CMakeLists.txt

@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
     foreach (feat NATIVE
+                  SSE42
                   AVX AVX2 BMI2 AVX_VNNI FMA F16C
                   AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                   AMX_TILE AMX_INT8 AMX_BF16)

@@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
     add_custom_target(ggml-cpu)
-    ggml_add_cpu_backend_variant(sandybridge AVX)
-    ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 BMI2 FMA)
-    ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+    ggml_add_cpu_backend_variant(x64)
+    ggml_add_cpu_backend_variant(sse42       SSE42)
+    ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+    ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C AVX2 BMI2 FMA)
+    ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
 endif()
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt

@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             elseif (GGML_AVX)
                 list(APPEND ARCH_FLAGS /arch:AVX)
                 list(APPEND ARCH_DEFINITIONS GGML_AVX)
-            else ()
+            elseif (GGML_SSE42)
                 list(APPEND ARCH_FLAGS /arch:SSE4.2)
                 list(APPEND ARCH_DEFINITIONS GGML_SSE42)
             endif()

@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         if (GGML_NATIVE)
             list(APPEND ARCH_FLAGS -march=native)
         else ()
-            list(APPEND ARCH_FLAGS -msse4.2)
-            list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            if (GGML_SSE42)
+                list(APPEND ARCH_FLAGS -msse4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            endif()
             if (GGML_F16C)
                 list(APPEND ARCH_FLAGS -mf16c)
                 list(APPEND ARCH_DEFINITIONS GGML_F16C)

@@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
             message(STATUS "z15 target")
-            list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
+            list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
             message(STATUS "z16 target")
-            list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
+            list(APPEND ARCH_FLAGS -march=z16)
+        elseif (${S390X_M} MATCHES "9175|9176")
+            # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+            message(STATUS "z17 target")
+            list(APPEND ARCH_FLAGS -march=z17)
         else()
-            message(STATUS "Unknown target")
+            message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp

@@ -263,7 +263,7 @@ void test_x86_is() {
 static int ggml_backend_cpu_x86_score() {
     // FIXME: this does not check for OS support
-    int score = 0;
+    int score = 1;
     cpuid_x86 is;

 #ifdef GGML_FMA
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c

@@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float               = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type             = GGML_TYPE_F16,
         .nrows                    = 1,

@@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float               = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float               = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float               = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type             = GGML_TYPE_BF16,
         .nrows                    = 1,

@@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D_DW:
+            {
+                ggml_compute_forward_conv_2d_dw(params, tensor);
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 ggml_compute_forward_conv_transpose_2d(params, tensor);

@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:
            {

@@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }

+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                        _mm512_castsi512_ps(
+                            _mm512_slli_epi32(
+                                _mm512_cvtepu16_epi32(
+                                    _mm256_loadu_si256(
+                                        (const __m256i *)(x + i))),
+                                16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                        _mm256_castsi256_ps(
+                            _mm256_slli_epi32(
+                                _mm256_cvtepu16_epi32(
+                                    _mm_loadu_si128(
+                                        (const __m128i *)(x + i))),
+                                16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
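Not part of the diff — a rough round-trip illustration of the new ggml_cpu_* conversion helpers added above. The signatures come from the diff itself; the assumption that they are declared via "ggml-cpu.h" is mine.

#include <stdio.h>
#include "ggml.h"
#include "ggml-cpu.h" // assumed location of the ggml_cpu_* declarations

int main(void) {
    float       src[8] = {0.0f, 0.5f, 1.0f, 2.0f, 3.14159f, -1.0f, -0.25f, 65504.0f};
    ggml_fp16_t h[8];
    float       dst[8];

    ggml_cpu_fp32_to_fp16(src, h, 8);  // vectorized when F16C/AVX512F are available
    ggml_cpu_fp16_to_fp32(h, dst, 8);  // convert back for comparison

    for (int i = 0; i < 8; ++i) {
        printf("%f -> %f\n", src[i], dst[i]);
    }
    return 0;
}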
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp

@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
         GGML_ASSERT(i01 >= 0 && i01 < ne01);

-        ggml_fp16_to_fp32_row(
+        ggml_cpu_fp16_to_fp32(
                 (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
     }

@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
         GGML_ASSERT(i01 >= 0 && i01 < ne01);

-        ggml_bf16_to_fp32_row(
+        ggml_cpu_bf16_to_fp32(
                 (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
     }

@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
     }
 }

+// ggml_compute_forward_conv_2d_dw
+
+struct ggml_conv_2d_dw_params {
+    int64_t channels;
+    int64_t batch;
+    int64_t src_w;
+    int64_t src_h;
+    int64_t dst_w;
+    int64_t dst_h;
+    int64_t knl_w;
+    int64_t knl_h;
+    int stride_x;
+    int stride_y;
+    int pad_x;
+    int pad_y;
+    int dilation_x;
+    int dilation_y;
+};
+
+static void ggml_compute_forward_conv_2d_dw_cwhn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t c = p.channels;
+    const float * knl_data = (const float *)kernel->data;
+
+    const int64_t rows_total = p.dst_h * p.batch;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+#ifdef GGML_SIMD
+    const int64_t pkg_size = GGML_F32_EPR;
+    const int64_t pkg_count = c / pkg_size;
+    const int64_t c_pkg_end = pkg_count * pkg_size;
+#else
+    const int64_t c_pkg_end = 0;
+#endif
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t dst_y = row % p.dst_h;
+        const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
+        for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+            float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
+            const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
+            const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
+
+#ifdef GGML_SIMD
+            // Vectorized loop
+            for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
+                GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
+                        GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
+                        sum = GGML_F32_VEC_FMA(sum, k, s);
+                    }
+                }
+                GGML_F32_VEC_STORE(dst_data + c_i, sum);
+            }
+#endif
+            // Scalar loop
+            for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = src_y_base + knl_y * p.dilation_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = src_x_base + knl_x * p.dilation_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
+                             * src_data[(src_y * p.src_w + src_x) * c + c_i];
+                    }
+                }
+                dst_data[c_i] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_dw_whcn(
+        const ggml_compute_params * params,
+        const ggml_tensor * src,
+        const ggml_tensor * kernel,
+        ggml_tensor * dst,
+        const ggml_conv_2d_dw_params & p) {
+
+    const int64_t n = p.channels * p.batch;
+    const int64_t per_thread = (n + params->nth - 1) / params->nth;
+    const int64_t start = params->ith * per_thread;
+    const int64_t end = MIN(start + per_thread, n);
+
+    for (int64_t i = start; i < end; ++i) {
+        const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
+        const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
+        float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
+
+        for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
+            for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
+
+                float sum = 0.0f;
+                for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+                    const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+                    if (src_y < 0 || src_y >= p.src_h) {
+                        continue;
+                    }
+                    for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                        const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                        if (src_x < 0 || src_x >= p.src_w) {
+                            continue;
+                        }
+                        sum += knl_data[knl_y * p.knl_w + knl_x]
+                             * src_data[src_y * p.src_w + src_x];
+                    }
+                }
+                dst_data[dst_y * p.dst_w + dst_x] = sum;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d_dw(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * src = dst->src[1];
+    ggml_conv_2d_dw_params p;
+    p.channels = src->ne[2];
+    p.batch = src->ne[3];
+    p.src_w = src->ne[0];
+    p.src_h = src->ne[1];
+    p.dst_w = dst->ne[0];
+    p.dst_h = dst->ne[1];
+    p.knl_w = kernel->ne[0];
+    p.knl_h = kernel->ne[1];
+    p.stride_x = dst->op_params[0];
+    p.stride_y = dst->op_params[1];
+    p.pad_x = dst->op_params[2];
+    p.pad_y = dst->op_params[3];
+    p.dilation_x = dst->op_params[4];
+    p.dilation_y = dst->op_params[5];
+
+    GGML_ASSERT(kernel->ne[3] == p.channels);
+    GGML_ASSERT(dst->ne[3] == p.batch);
+
+    if (ggml_is_contiguous(src)) {
+        ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
+    } else if (ggml_is_contiguous_channels(src)) {
+        // kernel should also have channels most contiguous in memory
+        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
+        ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
+    } else {
+        GGML_ABORT("non-contiguous memory layout not supported");
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0

 static void ggml_compute_forward_pool_1d_sk_p0(
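For reference while reading the new depthwise kernels above: the dst_w/dst_h extents they iterate over follow the usual convolution output-size arithmetic. The helper below is illustrative only and not from the diff; whether it matches ggml's own size calculation exactly is an assumption.

#include <stdint.h>

// Conventional convolution output size for one dimension.
static int64_t conv_out_size(int64_t in, int64_t knl, int stride, int pad, int dilation) {
    return (in + 2*pad - dilation*(knl - 1) - 1) / stride + 1;
}
// Example: in = 32, knl = 3, stride = 1, pad = 1, dilation = 1 -> 32 (same-size output).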
ml/backend/ggml/ggml/src/ggml-cpu/ops.h

@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h

@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32_EPR 4

 #define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_ZERO         {0.0f}
 #define GGML_F32x4_SET1         vec_splats
 #define GGML_F32x4_LOAD(p)      vec_xl(0, p)
 #define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh

@@ -78,13 +78,13 @@
 // Moore Threads
 #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)

-#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
-#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

 #ifdef __CUDA_ARCH_LIST__
ml/backend/ggml/ggml/src/ggml-cuda/convert.cu

 #include "convert.cuh"
 #include "dequantize.cuh"

+#include <cstdint>
+
 #define CUDA_Q8_0_NE_ALIGN 2048

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>

@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }

 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(
+        const void * __restrict__ vx, dst_t * __restrict__ y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

-    if (i >= k) {
+    if (i00 >= ne00) {
         return;
     }

+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
     const src_t * x = (const src_t *) vx;

-    y[i] = float(x[i]);
+    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
+    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
+    y[iy] = float(x[ix]);
 }

 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void convert_unary_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
 }

 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
         default:
             return nullptr;
     }

@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
-            return convert_unary_cuda<nv_bfloat16>;
+            return convert_unary_cont_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }

@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cont_cuda<nv_bfloat16>;
+        default:
+            return nullptr;
+    }
+}
+
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_unary_cuda<float>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cuda<nv_bfloat16>;
         default:
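The reworked convert_unary kernel above now reads a strided source tensor and writes a densely packed destination. As a plain-C restatement of that index mapping (sketch only, not from the commit, with float used for both sides), the s01/s02/s03 element strides are applied like this:

#include <stdint.h>

// Copy a 4D tensor with element strides s01/s02/s03 (innermost dim contiguous)
// into a fully contiguous destination, mirroring the kernel's ix/iy arithmetic.
static void convert_strided_to_contiguous(const float * x, float * y,
        int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
        int64_t s01, int64_t s02, int64_t s03) {
    for (int64_t i03 = 0; i03 < ne03; ++i03) {
        for (int64_t i02 = 0; i02 < ne02; ++i02) {
            for (int64_t i01 = 0; i01 < ne01; ++i01) {
                for (int64_t i00 = 0; i00 < ne00; ++i00) {
                    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
                    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
                    y[iy] = x[ix];
                }
            }
        }
    }
}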
ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh

@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);

 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half> to_fp16_cuda_t;

@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);

 to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
+
+// TODO more general support for non-contiguous inputs
+
+template<typename T>
+using to_t_nc_cuda_t = void (*)(const void * x, T * y,
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
+    int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
+
+typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu

@@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     if (ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
+#else
+    GGML_UNUSED(disable_indirection_for_this_node);
 #endif
 }
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu

@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
     dfloat2 v;
     dequantize_kernel(src0_row, ib, iqs, v);

-    dst_row[iybs + iqs + 0]        = v.x;
-    dst_row[iybs + iqs + y_offset] = v.y;
+    dst_row[iybs + iqs + 0]        = float(v.x);
+    dst_row[iybs + iqs + y_offset] = float(v.y);
 }

 template<typename src0_t, typename dst_t>

@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
     dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
     const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);

-    dst_row[i00] = src0_row[i00];
+    dst_row[i00] = float(src0_row[i00]);
 }

 template<typename grad_t, typename dst_t>

@@ -86,122 +86,161 @@ static __global__ void k_get_rows_back_float(
     dst[dst_row*ncols + col] = sum;
 }

-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(
-        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
+template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
+static void get_rows_cuda_q(
+        const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
     const dim3 block_nums(block_num_x, ne10, ne11*ne12);

     // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
+    //const size_t s0 = nb0 / sizeof(dst_t);
+    const size_t s1 = nb1 / sizeof(dst_t);
+    const size_t s2 = nb2 / sizeof(dst_t);
+    const size_t s3 = nb3 / sizeof(dst_t);

-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
+    const size_t s10 = nb10 / sizeof(int32_t);
+    const size_t s11 = nb11 / sizeof(int32_t);
+    const size_t s12 = nb12 / sizeof(int32_t);
+    //const size_t s13 = nb13 / sizeof(int32_t);

     GGML_ASSERT(ne00 % 2 == 0);

     k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
+            src0_d, src1_d, dst_d,
             ne00, /*ne01, ne02, ne03,*/
             /*ne10, ne11,*/ ne12, /*ne13,*/
             /* s0,*/ s1, s2, s3,
             /* nb00,*/ nb01, nb02, nb03,
             s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
 }

-template<typename src0_t>
+template<typename src0_t, typename dst_t>
 static void get_rows_cuda_float(
-        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-        const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(ne13 == 1);
+        const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
     const dim3 block_nums(block_num_x, ne10, ne11*ne12);

     // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
+    //const size_t s0 = nb0 / sizeof(dst_t);
+    const size_t s1 = nb1 / sizeof(dst_t);
+    const size_t s2 = nb2 / sizeof(dst_t);
+    const size_t s3 = nb3 / sizeof(dst_t);

-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
+    const size_t s10 = nb10 / sizeof(int32_t);
+    const size_t s11 = nb11 / sizeof(int32_t);
+    const size_t s12 = nb12 / sizeof(int32_t);
+    //const size_t s13 = nb13 / sizeof(int32_t);

     k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
+            src0_d, src1_d, dst_d,
             ne00, /*ne01, ne02, ne03,*/
             /*ne10, ne11,*/ ne12, /*ne13,*/
             /* s0,*/ s1, s2, s3,
             /* nb00,*/ nb01, nb02, nb03,
             s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
 }

-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    const void    * src0_d = (const void    *) src0->data;
-    const int32_t * src1_d = (const int32_t *) src1->data;
-    float         * dst_d  = (float         *) dst->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT( dst->nb[0] == ggml_type_size( dst->type));
-
-    switch (src0->type) {
+template<typename dst_t>
+static void ggml_cuda_get_rows_switch_src0_type(
+        const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
+    switch (src0_type) {
         case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_float((const half *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_BF16:
+            get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
+            get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
         default:
             // TODO: k-quants
-            GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
             break;
     }
 }

+void get_rows_cuda(
+        const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
+        int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
+        int64_t ne10, int64_t ne11, int64_t ne12,
+        size_t nb10, size_t nb11, size_t nb12,
+        size_t nb1, size_t nb2, size_t nb3,
+        cudaStream_t stream) {
+    switch (dst_type) {
+        case GGML_TYPE_F32:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_F16:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        case GGML_TYPE_BF16:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
+        default:
+            GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type));
+            break;
+    }
+}
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ne13 == 1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT( dst->nb[0] == ggml_type_size( dst->type));
+
+    get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type,
+        ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+}
+
 void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
     const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass
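For context while reading the refactor above, all of these kernels implement a row gather: output row r is row src1[r] of src0, converted to the destination type. A minimal scalar reference (not from the commit; type conversion and the batched/broadcast dimensions are omitted) looks like this:

#include <stdint.h>

// dst gets nrows_out rows of ncols elements; row r is copied from src0 row src1[r].
static void get_rows_ref(const float * src0, const int32_t * src1, float * dst,
                         int64_t ncols, int64_t nrows_out) {
    for (int64_t r = 0; r < nrows_out; ++r) {
        const float * src_row = src0 + (int64_t) src1[r] * ncols;
        for (int64_t c = 0; c < ncols; ++c) {
            dst[r*ncols + c] = src_row[c];
        }
    }
}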
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh

@@ -3,6 +3,13 @@
 #define CUDA_GET_ROWS_BLOCK_SIZE 256
 #define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256

+void get_rows_cuda(
+        const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
+        int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
+        int64_t ne10, int64_t ne11, int64_t ne12,
+        size_t nb10, size_t nb11, size_t nb12,
+        size_t nb1, size_t nb2, size_t nb3, cudaStream_t stream);
+
 void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu — diff collapsed in this view (+173, −198)

ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu — diff collapsed in this view (+189, −31)

ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh — diff collapsed in this view (+442, −192)

ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu — diff collapsed in this view (+110, −71)
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh

@@ -3,7 +3,7 @@
 // maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
 #define MMV_MAX_ROWS 512

-void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

 void ggml_cuda_op_mul_mat_vec(
     ggml_backend_cuda_context & ctx,
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu — diff collapsed in this view (+340, −267)
Diff page 1 of 4.