Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
8dd12c87
Unverified
Commit
8dd12c87
authored
May 01, 2025
by
Jeffrey Morgan
Committed by
GitHub
May 01, 2025
Browse files
llama: update to commit e1e8e099 (#10513)
parent
e6d2d041
Changes
68
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1738 additions
and
862 deletions
+1738
-862
ml/backend/ggml/ggml/include/ggml.h
ml/backend/ggml/ggml/include/ggml.h
+23
-3
ml/backend/ggml/ggml/src/CMakeLists.txt
ml/backend/ggml/ggml/src/CMakeLists.txt
+8
-5
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
+11
-5
ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp
ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp
+1
-1
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+94
-2
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+174
-2
ml/backend/ggml/ggml/src/ggml-cpu/ops.h
ml/backend/ggml/ggml/src/ggml-cpu/ops.h
+1
-0
ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h
ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h
+1
-1
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+4
-4
ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
+41
-12
ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh
ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh
+11
-1
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
+2
-0
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
+105
-66
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh
+7
-0
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+173
-198
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu
+189
-31
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh
+442
-192
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu
+110
-71
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh
+1
-1
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu
+340
-267
No files found.
ml/backend/ggml/ggml/include/ggml.h
View file @
8dd12c87
...
@@ -393,8 +393,8 @@ extern "C" {
...
@@ -393,8 +393,8 @@ extern "C" {
// precision
// precision
enum
ggml_prec
{
enum
ggml_prec
{
GGML_PREC_DEFAULT
,
GGML_PREC_DEFAULT
=
0
,
// stored as ggml_tensor.op_params, 0 by default
GGML_PREC_F32
,
GGML_PREC_F32
=
10
,
};
};
// model file types
// model file types
...
@@ -481,6 +481,7 @@ extern "C" {
...
@@ -481,6 +481,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D
,
GGML_OP_CONV_TRANSPOSE_1D
,
GGML_OP_IM2COL
,
GGML_OP_IM2COL
,
GGML_OP_IM2COL_BACK
,
GGML_OP_IM2COL_BACK
,
GGML_OP_CONV_2D_DW
,
GGML_OP_CONV_TRANSPOSE_2D
,
GGML_OP_CONV_TRANSPOSE_2D
,
GGML_OP_POOL_1D
,
GGML_OP_POOL_1D
,
GGML_OP_POOL_2D
,
GGML_OP_POOL_2D
,
...
@@ -678,6 +679,9 @@ extern "C" {
...
@@ -678,6 +679,9 @@ extern "C" {
GGML_API
bool
ggml_is_contiguous_1
(
const
struct
ggml_tensor
*
tensor
);
// contiguous for dims >= 1
GGML_API
bool
ggml_is_contiguous_1
(
const
struct
ggml_tensor
*
tensor
);
// contiguous for dims >= 1
GGML_API
bool
ggml_is_contiguous_2
(
const
struct
ggml_tensor
*
tensor
);
// contiguous for dims >= 2
GGML_API
bool
ggml_is_contiguous_2
(
const
struct
ggml_tensor
*
tensor
);
// contiguous for dims >= 2
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API
bool
ggml_is_contiguous_channels
(
const
struct
ggml_tensor
*
tensor
);
GGML_API
bool
ggml_are_same_shape
(
const
struct
ggml_tensor
*
t0
,
const
struct
ggml_tensor
*
t1
);
GGML_API
bool
ggml_are_same_shape
(
const
struct
ggml_tensor
*
t0
,
const
struct
ggml_tensor
*
t1
);
GGML_API
bool
ggml_are_same_stride
(
const
struct
ggml_tensor
*
t0
,
const
struct
ggml_tensor
*
t1
);
GGML_API
bool
ggml_are_same_stride
(
const
struct
ggml_tensor
*
t0
,
const
struct
ggml_tensor
*
t1
);
...
@@ -1661,7 +1665,7 @@ extern "C" {
...
@@ -1661,7 +1665,7 @@ extern "C" {
struct
ggml_tensor
*
a
,
struct
ggml_tensor
*
a
,
struct
ggml_tensor
*
b
);
struct
ggml_tensor
*
b
);
// depthwise
// depthwise
(via im2col and mul_mat)
GGML_API
struct
ggml_tensor
*
ggml_conv_2d_dw
(
GGML_API
struct
ggml_tensor
*
ggml_conv_2d_dw
(
struct
ggml_context
*
ctx
,
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
,
// convolution kernel
struct
ggml_tensor
*
a
,
// convolution kernel
...
@@ -1673,6 +1677,22 @@ extern "C" {
...
@@ -1673,6 +1677,22 @@ extern "C" {
int
d0
,
// dilation dimension 0
int
d0
,
// dilation dimension 0
int
d1
);
// dilation dimension 1
int
d1
);
// dilation dimension 1
// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API
struct
ggml_tensor
*
ggml_conv_2d_dw_direct
(
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
,
struct
ggml_tensor
*
b
,
int
stride0
,
int
stride1
,
int
pad0
,
int
pad1
,
int
dilation0
,
int
dilation1
);
GGML_API
struct
ggml_tensor
*
ggml_conv_transpose_2d_p0
(
GGML_API
struct
ggml_tensor
*
ggml_conv_transpose_2d_p0
(
struct
ggml_context
*
ctx
,
struct
ggml_context
*
ctx
,
struct
ggml_tensor
*
a
,
struct
ggml_tensor
*
a
,
...
...
ml/backend/ggml/ggml/src/CMakeLists.txt
View file @
8dd12c87
...
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
...
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
set
(
GGML_CPU_TAG_NAME
${
tag_name
}
)
set
(
GGML_CPU_TAG_NAME
${
tag_name
}
)
# other: OPENMP LLAMAFILE CPU_HBM
# other: OPENMP LLAMAFILE CPU_HBM
foreach
(
feat NATIVE
foreach
(
feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16
)
AMX_TILE AMX_INT8 AMX_BF16
)
...
@@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
...
@@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
message
(
FATAL_ERROR
"GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL"
)
message
(
FATAL_ERROR
"GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL"
)
endif
()
endif
()
add_custom_target
(
ggml-cpu
)
add_custom_target
(
ggml-cpu
)
ggml_add_cpu_backend_variant
(
sandybridge AVX
)
ggml_add_cpu_backend_variant
(
x64
)
ggml_add_cpu_backend_variant
(
haswell AVX F16C AVX2 BMI2 FMA
)
ggml_add_cpu_backend_variant
(
sse42 SSE42
)
ggml_add_cpu_backend_variant
(
skylakex AVX F16C AVX2 BMI2 FMA AVX512
)
ggml_add_cpu_backend_variant
(
sandybridge SSE42 AVX
)
ggml_add_cpu_backend_variant
(
icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI
)
ggml_add_cpu_backend_variant
(
haswell SSE42 AVX F16C AVX2 BMI2 FMA
)
ggml_add_cpu_backend_variant
(
alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI
)
ggml_add_cpu_backend_variant
(
skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512
)
ggml_add_cpu_backend_variant
(
icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI
)
ggml_add_cpu_backend_variant
(
alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI
)
elseif
(
GGML_CPU
)
elseif
(
GGML_CPU
)
ggml_add_cpu_backend_variant_impl
(
""
)
ggml_add_cpu_backend_variant_impl
(
""
)
endif
()
endif
()
...
...
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
View file @
8dd12c87
...
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
...
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
elseif
(
GGML_AVX
)
elseif
(
GGML_AVX
)
list
(
APPEND ARCH_FLAGS /arch:AVX
)
list
(
APPEND ARCH_FLAGS /arch:AVX
)
list
(
APPEND ARCH_DEFINITIONS GGML_AVX
)
list
(
APPEND ARCH_DEFINITIONS GGML_AVX
)
else
(
)
else
if
(
GGML_SSE42
)
list
(
APPEND ARCH_FLAGS /arch:SSE4.2
)
list
(
APPEND ARCH_FLAGS /arch:SSE4.2
)
list
(
APPEND ARCH_DEFINITIONS GGML_SSE42
)
list
(
APPEND ARCH_DEFINITIONS GGML_SSE42
)
endif
()
endif
()
...
@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
...
@@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if
(
GGML_NATIVE
)
if
(
GGML_NATIVE
)
list
(
APPEND ARCH_FLAGS -march=native
)
list
(
APPEND ARCH_FLAGS -march=native
)
else
()
else
()
list
(
APPEND ARCH_FLAGS -msse4.2
)
if
(
GGML_SSE42
)
list
(
APPEND ARCH_DEFINITIONS GGML_SSE42
)
list
(
APPEND ARCH_FLAGS -msse4.2
)
list
(
APPEND ARCH_DEFINITIONS GGML_SSE42
)
endif
()
if
(
GGML_F16C
)
if
(
GGML_F16C
)
list
(
APPEND ARCH_FLAGS -mf16c
)
list
(
APPEND ARCH_FLAGS -mf16c
)
list
(
APPEND ARCH_DEFINITIONS GGML_F16C
)
list
(
APPEND ARCH_DEFINITIONS GGML_F16C
)
...
@@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
...
@@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2
# TODO: Separation to determine activation of VX/VXE/VXE2
if
(
${
S390X_M
}
MATCHES
"8561|8562"
)
if
(
${
S390X_M
}
MATCHES
"8561|8562"
)
message
(
STATUS
"z15 target"
)
message
(
STATUS
"z15 target"
)
list
(
APPEND ARCH_FLAGS -march=z15
-mtune=z15
)
list
(
APPEND ARCH_FLAGS -march=z15
)
elseif
(
${
S390X_M
}
MATCHES
"3931"
)
elseif
(
${
S390X_M
}
MATCHES
"3931"
)
message
(
STATUS
"z16 target"
)
message
(
STATUS
"z16 target"
)
list
(
APPEND ARCH_FLAGS -march=z16 -mtune=z16
)
list
(
APPEND ARCH_FLAGS -march=z16
)
elseif
(
${
S390X_M
}
MATCHES
"9175|9176"
)
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
message
(
STATUS
"z17 target"
)
list
(
APPEND ARCH_FLAGS -march=z17
)
else
()
else
()
message
(
STATUS
"Unknown target"
)
message
(
STATUS
"Unknown target"
)
message
(
WARNING
"Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF."
)
message
(
WARNING
"Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF."
)
...
...
ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp
View file @
8dd12c87
...
@@ -263,7 +263,7 @@ void test_x86_is() {
...
@@ -263,7 +263,7 @@ void test_x86_is() {
static
int
ggml_backend_cpu_x86_score
()
{
static
int
ggml_backend_cpu_x86_score
()
{
// FIXME: this does not check for OS support
// FIXME: this does not check for OS support
int
score
=
0
;
int
score
=
1
;
cpuid_x86
is
;
cpuid_x86
is
;
#ifdef GGML_FMA
#ifdef GGML_FMA
...
...
ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
View file @
8dd12c87
...
@@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
...
@@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.
nrows
=
1
,
.
nrows
=
1
,
},
},
[
GGML_TYPE_F16
]
=
{
[
GGML_TYPE_F16
]
=
{
.
from_float
=
(
ggml_from_float_t
)
ggml_fp32_to_fp16
_row
,
.
from_float
=
(
ggml_from_float_t
)
ggml_
cpu_
fp32_to_fp16
,
.
vec_dot
=
(
ggml_vec_dot_t
)
ggml_vec_dot_f16
,
.
vec_dot
=
(
ggml_vec_dot_t
)
ggml_vec_dot_f16
,
.
vec_dot_type
=
GGML_TYPE_F16
,
.
vec_dot_type
=
GGML_TYPE_F16
,
.
nrows
=
1
,
.
nrows
=
1
,
...
@@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
...
@@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.
from_float
=
quantize_row_q8_K
,
.
from_float
=
quantize_row_q8_K
,
},
},
[
GGML_TYPE_BF16
]
=
{
[
GGML_TYPE_BF16
]
=
{
.
from_float
=
(
ggml_from_float_t
)
ggml_fp32_to_bf16
_row
,
.
from_float
=
(
ggml_from_float_t
)
ggml_
cpu_
fp32_to_bf16
,
.
vec_dot
=
(
ggml_vec_dot_t
)
ggml_vec_dot_bf16
,
.
vec_dot
=
(
ggml_vec_dot_t
)
ggml_vec_dot_bf16
,
.
vec_dot_type
=
GGML_TYPE_BF16
,
.
vec_dot_type
=
GGML_TYPE_BF16
,
.
nrows
=
1
,
.
nrows
=
1
,
...
@@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
...
@@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
{
ggml_compute_forward_im2col_back_f32
(
params
,
tensor
);
ggml_compute_forward_im2col_back_f32
(
params
,
tensor
);
}
break
;
}
break
;
case
GGML_OP_CONV_2D_DW
:
{
ggml_compute_forward_conv_2d_dw
(
params
,
tensor
);
}
break
;
case
GGML_OP_CONV_TRANSPOSE_2D
:
case
GGML_OP_CONV_TRANSPOSE_2D
:
{
{
ggml_compute_forward_conv_transpose_2d
(
params
,
tensor
);
ggml_compute_forward_conv_transpose_2d
(
params
,
tensor
);
...
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
...
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
}
break
;
}
break
;
case
GGML_OP_IM2COL
:
case
GGML_OP_IM2COL
:
case
GGML_OP_IM2COL_BACK
:
case
GGML_OP_IM2COL_BACK
:
case
GGML_OP_CONV_2D_DW
:
case
GGML_OP_CONV_TRANSPOSE_1D
:
case
GGML_OP_CONV_TRANSPOSE_1D
:
case
GGML_OP_CONV_TRANSPOSE_2D
:
case
GGML_OP_CONV_TRANSPOSE_2D
:
{
{
...
@@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
...
@@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
return
ggml_graph_compute
(
cgraph
,
&
cplan
);
return
ggml_graph_compute
(
cgraph
,
&
cplan
);
}
}
// Converts n consecutive fp32 values starting at x into fp16 values written to y.
//
// When the translation unit is built with F16C, the bulk of the data is handled
// with unaligned vector loads/stores in progressively narrower steps
// (16 elements if AVX-512F is also enabled, then 8, then 4). Whatever remains is
// converted one element at a time with GGML_FP32_TO_FP16.
void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t pos = 0;

#if defined(__F16C__)
#if defined(__AVX512F__)
    // 16 floats -> 16 halves per step
    while (pos + 15 < n) {
        const __m512  in16  = _mm512_loadu_ps(x + pos);
        const __m256i out16 = _mm512_cvtps_ph(in16, _MM_FROUND_TO_NEAREST_INT);
        _mm256_storeu_si256((__m256i *)(y + pos), out16);
        pos += 16;
    }
#endif
    // 8 floats -> 8 halves per step
    while (pos + 7 < n) {
        const __m256  in8  = _mm256_loadu_ps(x + pos);
        const __m128i out8 = _mm256_cvtps_ph(in8, _MM_FROUND_TO_NEAREST_INT);
        _mm_storeu_si128((__m128i *)(y + pos), out8);
        pos += 8;
    }
    // 4 floats -> 4 halves per step; only the low 64 bits of the result are stored
    while (pos + 3 < n) {
        const __m128  in4  = _mm_loadu_ps(x + pos);
        const __m128i out4 = _mm_cvtps_ph(in4, _MM_FROUND_TO_NEAREST_INT);
        _mm_storel_epi64((__m128i *)(y + pos), out4);
        pos += 4;
    }
#endif

    // scalar tail (or the whole range when no vector path is compiled in)
    while (pos < n) {
        y[pos] = GGML_FP32_TO_FP16(x[pos]);
        ++pos;
    }
}
// Converts n consecutive fp16 values starting at x into fp32 values written to y.
//
// When the translation unit is built with F16C, the data is processed with
// unaligned vector loads/stores in steps of 16 elements (only if AVX-512F is
// also enabled), then 8, then 4. Remaining elements go through
// GGML_FP16_TO_FP32 one at a time.
void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
    int64_t pos = 0;

#if defined(__F16C__)
#if defined(__AVX512F__)
    // 16 halves -> 16 floats per step
    while (pos + 15 < n) {
        const __m256i in16  = _mm256_loadu_si256((const __m256i *)(x + pos));
        const __m512  out16 = _mm512_cvtph_ps(in16);
        _mm512_storeu_ps(y + pos, out16);
        pos += 16;
    }
#endif
    // 8 halves -> 8 floats per step
    while (pos + 7 < n) {
        const __m128i in8  = _mm_loadu_si128((const __m128i *)(x + pos));
        const __m256  out8 = _mm256_cvtph_ps(in8);
        _mm256_storeu_ps(y + pos, out8);
        pos += 8;
    }
    // 4 halves -> 4 floats per step; only 64 bits of input are loaded
    while (pos + 3 < n) {
        const __m128i in4  = _mm_loadl_epi64((const __m128i *)(x + pos));
        const __m128  out4 = _mm_cvtph_ps(in4);
        _mm_storeu_ps(y + pos, out4);
        pos += 4;
    }
#endif

    // scalar tail (or the whole range when no vector path is compiled in)
    while (pos < n) {
        y[pos] = GGML_FP16_TO_FP32(x[pos]);
        ++pos;
    }
}
// Converts n consecutive fp32 values starting at x into bf16 values written to y.
// Every element is converted individually with GGML_FP32_TO_BF16; there is no
// vectorized path in this function.
void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
    for (int64_t pos = 0; pos < n; ++pos) {
        y[pos] = GGML_FP32_TO_BF16(x[pos]);
    }
}
// Converts n consecutive bf16 values starting at x into fp32 values written to y.
//
// With AVX2 enabled at compile time, each vector step zero-extends the 16-bit
// inputs to 32-bit lanes, shifts them left by 16 bits and reinterprets the
// result as floats. Steps of 16 elements are used when AVX-512F is also enabled,
// followed by steps of 8. Remaining elements go through GGML_BF16_TO_FP32.
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t pos = 0;

#if defined(__AVX2__)
#if defined(__AVX512F__)
    // 16 elements per step
    while (pos + 15 < n) {
        const __m256i raw16     = _mm256_loadu_si256((const __m256i *)(x + pos));
        const __m512i widened16 = _mm512_cvtepu16_epi32(raw16);
        const __m512i shifted16 = _mm512_slli_epi32(widened16, 16);
        _mm512_storeu_ps(y + pos, _mm512_castsi512_ps(shifted16));
        pos += 16;
    }
#endif
    // 8 elements per step
    while (pos + 7 < n) {
        const __m128i raw8     = _mm_loadu_si128((const __m128i *)(x + pos));
        const __m256i widened8 = _mm256_cvtepu16_epi32(raw8);
        const __m256i shifted8 = _mm256_slli_epi32(widened8, 16);
        _mm256_storeu_ps(y + pos, _mm256_castsi256_ps(shifted8));
        pos += 8;
    }
#endif

    // scalar tail (or the whole range when no vector path is compiled in)
    while (pos < n) {
        y[pos] = GGML_BF16_TO_FP32(x[pos]);
        pos++;
    }
}
int
ggml_cpu_has_avx
(
void
)
{
int
ggml_cpu_has_avx
(
void
)
{
#if defined(__AVX__)
#if defined(__AVX__)
...
...
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
View file @
8dd12c87
...
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
...
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
GGML_ASSERT
(
i01
>=
0
&&
i01
<
ne01
);
GGML_ASSERT
(
i01
>=
0
&&
i01
<
ne01
);
ggml_fp16_to_fp32
_row
(
ggml_
cpu_
fp16_to_fp32
(
(
const
ggml_fp16_t
*
)
((
char
*
)
src0
->
data
+
i01
*
nb01
+
i11
*
nb02
+
i12
*
nb03
),
(
const
ggml_fp16_t
*
)
((
char
*
)
src0
->
data
+
i01
*
nb01
+
i11
*
nb02
+
i12
*
nb03
),
(
float
*
)
((
char
*
)
dst
->
data
+
i10
*
nb1
+
i11
*
nb2
+
i12
*
nb3
),
nc
);
(
float
*
)
((
char
*
)
dst
->
data
+
i10
*
nb1
+
i11
*
nb2
+
i12
*
nb3
),
nc
);
}
}
...
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
...
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
GGML_ASSERT
(
i01
>=
0
&&
i01
<
ne01
);
GGML_ASSERT
(
i01
>=
0
&&
i01
<
ne01
);
ggml_bf16_to_fp32
_row
(
ggml_
cpu_
bf16_to_fp32
(
(
const
ggml_bf16_t
*
)
((
char
*
)
src0
->
data
+
i01
*
nb01
+
i11
*
nb02
+
i12
*
nb03
),
(
const
ggml_bf16_t
*
)
((
char
*
)
src0
->
data
+
i01
*
nb01
+
i11
*
nb02
+
i12
*
nb03
),
(
float
*
)
((
char
*
)
dst
->
data
+
i10
*
nb1
+
i11
*
nb2
+
i12
*
nb3
),
nc
);
(
float
*
)
((
char
*
)
dst
->
data
+
i10
*
nb1
+
i11
*
nb2
+
i12
*
nb3
),
nc
);
}
}
...
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
...
@@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d(
}
}
}
}
// ggml_compute_forward_conv_2d_dw
// Parameter bundle shared by the depthwise 2D convolution kernels.
// It is filled in by ggml_compute_forward_conv_2d_dw from the source tensor,
// the kernel tensor, the destination tensor and the destination's op_params.
struct ggml_conv_2d_dw_params {
    // extents taken from the input tensor
    int64_t channels;   // src->ne[2]
    int64_t batch;      // src->ne[3]
    int64_t src_w;      // src->ne[0]
    int64_t src_h;      // src->ne[1]

    // extents taken from the output tensor
    int64_t dst_w;      // dst->ne[0]
    int64_t dst_h;      // dst->ne[1]

    // extents taken from the kernel tensor
    int64_t knl_w;      // kernel->ne[0]
    int64_t knl_h;      // kernel->ne[1]

    // convolution settings, read from dst->op_params[0..5] in this order
    int stride_x;
    int stride_y;
    int pad_x;
    int pad_y;
    int dilation_x;
    int dilation_y;
};
// Depthwise 2D convolution for data whose channel index varies fastest in memory:
// element (x, y, ch) of an image is read at offset (y * src_w + x) * channels + ch,
// and the kernel tap (kx, ky) for channel ch at (ky * knl_w + kx) * channels + ch.
//
// Output rows (dst_h * batch of them) are divided between threads in contiguous
// chunks using params->ith / params->nth. Kernel taps that would read outside the
// source image are skipped, so they contribute nothing to the sum.
//
// With GGML_SIMD defined, channels are processed GGML_F32_EPR at a time for as
// many full groups as fit; the remaining channels use the scalar loop.
static void ggml_compute_forward_conv_2d_dw_cwhn(
        const ggml_compute_params * params,
        const ggml_tensor * src,
        const ggml_tensor * kernel,
        ggml_tensor * dst,
        const ggml_conv_2d_dw_params & p) {

    const int64_t n_ch    = p.channels;
    const float * weights = (const float *)kernel->data;

    // ceil-divide the output rows between the threads
    const int64_t n_rows    = p.dst_h * p.batch;
    const int64_t chunk     = (n_rows + params->nth - 1) / params->nth;
    const int64_t row_first = params->ith * chunk;
    const int64_t row_last  = MIN(row_first + chunk, n_rows);

#ifdef GGML_SIMD
    const int64_t lanes   = GGML_F32_EPR;
    const int64_t vec_end = (n_ch / lanes) * lanes;  // first channel not covered by a full vector
#else
    const int64_t vec_end = 0;
#endif

    for (int64_t row = row_first; row < row_last; ++row) {
        // a row index encodes both the image in the batch and the output y coordinate
        const int64_t out_y   = row % p.dst_h;
        const int64_t image   = row / p.dst_h;
        const float * src_img = (const float *)src->data + image * p.src_w * p.src_h * n_ch;
        const int64_t in_y0   = out_y * p.stride_y - p.pad_y;

        for (int64_t out_x = 0; out_x < p.dst_w; ++out_x) {
            float * dst_px      = (float *)dst->data + (row * p.dst_w + out_x) * n_ch;
            const int64_t in_x0 = out_x * p.stride_x - p.pad_x;

#ifdef GGML_SIMD
            // vectorized over channels
            for (int64_t ch = 0; ch < vec_end; ch += lanes) {
                GGML_F32_VEC acc = GGML_F32_VEC_ZERO;
                for (int64_t ky = 0; ky < p.knl_h; ++ky) {
                    const int64_t in_y = in_y0 + ky * p.dilation_y;
                    if (in_y >= 0 && in_y < p.src_h) {
                        for (int64_t kx = 0; kx < p.knl_w; ++kx) {
                            const int64_t in_x = in_x0 + kx * p.dilation_x;
                            if (in_x >= 0 && in_x < p.src_w) {
                                GGML_F32_VEC w = GGML_F32_VEC_LOAD(weights + (ky * p.knl_w + kx) * n_ch + ch);
                                GGML_F32_VEC v = GGML_F32_VEC_LOAD(src_img + (in_y * p.src_w + in_x) * n_ch + ch);
                                acc = GGML_F32_VEC_FMA(acc, w, v);
                            }
                        }
                    }
                }
                GGML_F32_VEC_STORE(dst_px + ch, acc);
            }
#endif
            // scalar path for the channels not handled above
            for (int64_t ch = vec_end; ch < n_ch; ++ch) {
                float acc = 0.0f;
                for (int64_t ky = 0; ky < p.knl_h; ++ky) {
                    const int64_t in_y = in_y0 + ky * p.dilation_y;
                    if (in_y >= 0 && in_y < p.src_h) {
                        for (int64_t kx = 0; kx < p.knl_w; ++kx) {
                            const int64_t in_x = in_x0 + kx * p.dilation_x;
                            if (in_x >= 0 && in_x < p.src_w) {
                                acc += weights[(ky * p.knl_w + kx) * n_ch + ch]
                                     * src_img[(in_y * p.src_w + in_x) * n_ch + ch];
                            }
                        }
                    }
                }
                dst_px[ch] = acc;
            }
        }
    }
}
// Depthwise 2D convolution for data stored plane by plane: plane i (one channel of
// one batch entry) starts at offset i * src_w * src_h in the source and
// i * dst_w * dst_h in the destination, with element (x, y) at y * width + x.
// The kernel for a plane is selected by (i % channels) and holds knl_w * knl_h taps.
//
// The channels * batch planes are divided between threads in contiguous chunks
// using params->ith / params->nth. Kernel taps that would read outside the source
// plane are skipped, so they contribute nothing to the sum.
static void ggml_compute_forward_conv_2d_dw_whcn(
        const ggml_compute_params * params,
        const ggml_tensor * src,
        const ggml_tensor * kernel,
        ggml_tensor * dst,
        const ggml_conv_2d_dw_params & p) {

    // ceil-divide the planes between the threads
    const int64_t n_planes    = p.channels * p.batch;
    const int64_t chunk       = (n_planes + params->nth - 1) / params->nth;
    const int64_t plane_first = params->ith * chunk;
    const int64_t plane_last  = MIN(plane_first + chunk, n_planes);

    for (int64_t plane = plane_first; plane < plane_last; ++plane) {
        const float * weights   = (const float *)kernel->data + (plane % p.channels) * p.knl_w * p.knl_h;
        const float * src_plane = (const float *)src->data + plane * p.src_w * p.src_h;
        float       * dst_plane = (float *)dst->data + plane * p.dst_w * p.dst_h;

        for (int64_t out_y = 0; out_y < p.dst_h; ++out_y) {
            for (int64_t out_x = 0; out_x < p.dst_w; ++out_x) {
                float acc = 0.0f;

                for (int64_t ky = 0; ky < p.knl_h; ++ky) {
                    const int64_t in_y = out_y * p.stride_y + ky * p.dilation_y - p.pad_y;
                    if (in_y >= 0 && in_y < p.src_h) {
                        for (int64_t kx = 0; kx < p.knl_w; ++kx) {
                            const int64_t in_x = out_x * p.stride_x + kx * p.dilation_x - p.pad_x;
                            if (in_x >= 0 && in_x < p.src_w) {
                                acc += weights[ky * p.knl_w + kx]
                                     * src_plane[in_y * p.src_w + in_x];
                            }
                        }
                    }
                }

                dst_plane[out_y * p.dst_w + out_x] = acc;
            }
        }
    }
}
// Entry point for GGML_OP_CONV_2D_DW on the CPU backend.
//
// dst->src[0] is the convolution kernel and dst->src[1] the input data. The
// function gathers tensor extents and the six op_params (stride, padding and
// dilation for x and y) into a ggml_conv_2d_dw_params, checks that the kernel's
// ne[3] matches the channel count and that dst's ne[3] matches the batch size,
// and then picks an implementation from the memory layout of the input:
//   - contiguous input                   -> ..._whcn
//   - channels-contiguous (permuted) one -> ..._cwhn
// Any other layout aborts.
void ggml_compute_forward_conv_2d_dw(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * kernel = dst->src[0];
    const ggml_tensor * src    = dst->src[1];

    ggml_conv_2d_dw_params p;

    // input extents
    p.src_w    = src->ne[0];
    p.src_h    = src->ne[1];
    p.channels = src->ne[2];
    p.batch    = src->ne[3];

    // output extents
    p.dst_w = dst->ne[0];
    p.dst_h = dst->ne[1];

    // kernel extents
    p.knl_w = kernel->ne[0];
    p.knl_h = kernel->ne[1];

    // convolution settings
    p.stride_x   = dst->op_params[0];
    p.stride_y   = dst->op_params[1];
    p.pad_x      = dst->op_params[2];
    p.pad_y      = dst->op_params[3];
    p.dilation_x = dst->op_params[4];
    p.dilation_y = dst->op_params[5];

    GGML_ASSERT(kernel->ne[3] == p.channels);
    GGML_ASSERT(dst->ne[3] == p.batch);

    if (ggml_is_contiguous(src)) {
        ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
        return;
    }

    if (ggml_is_contiguous_channels(src)) {
        // kernel should also have channels most contiguous in memory
        GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
        ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
        return;
    }

    GGML_ABORT("non-contiguous memory layout not supported");
}
// ggml_compute_forward_pool_1d_sk_p0
// ggml_compute_forward_pool_1d_sk_p0
static
void
ggml_compute_forward_pool_1d_sk_p0
(
static
void
ggml_compute_forward_pool_1d_sk_p0
(
...
...
ml/backend/ggml/ggml/src/ggml-cpu/ops.h
View file @
8dd12c87
...
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
...
@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void
ggml_compute_forward_im2col
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_im2col
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_im2col_back_f32
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_im2col_back_f32
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_conv_transpose_2d
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_conv_transpose_2d
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_conv_2d_dw
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_pool_1d
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_pool_1d
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_pool_2d
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_pool_2d
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_pool_2d_back
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
void
ggml_compute_forward_pool_2d_back
(
const
struct
ggml_compute_params
*
params
,
struct
ggml_tensor
*
dst
);
...
...
ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h
View file @
8dd12c87
...
@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
...
@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F32_EPR 4
#define GGML_F32_EPR 4
#define GGML_F32x4 vector float
#define GGML_F32x4 vector float
#define GGML_F32x4_ZERO 0.0f
#define GGML_F32x4_ZERO
{
0.0f
}
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
...
...
ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
View file @
8dd12c87
...
@@ -78,13 +78,13 @@
...
@@ -78,13 +78,13 @@
// Moore Threads
// Moore Threads
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
#define GGML_CUDA_CC_QY1 (GGML_
MUS
A_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY1 (GGML_
CUD
A_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_
MUS
A_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_QY2 (GGML_
CUD
A_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_
MUS
A_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_NG (GGML_
CUD
A_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_N
EXT
)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_N
G
)
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
#ifdef __CUDA_ARCH_LIST__
#ifdef __CUDA_ARCH_LIST__
...
...
ml/backend/ggml/ggml/src/ggml-cuda/convert.cu
View file @
8dd12c87
#include "convert.cuh"
#include "convert.cuh"
#include "dequantize.cuh"
#include "dequantize.cuh"
#include <cstdint>
#define CUDA_Q8_0_NE_ALIGN 2048
#define CUDA_Q8_0_NE_ALIGN 2048
template
<
int
qk
,
int
qr
,
dequantize_kernel_t
dequantize_kernel
,
typename
dst_t
>
template
<
int
qk
,
int
qr
,
dequantize_kernel_t
dequantize_kernel
,
typename
dst_t
>
...
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
...
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
}
}
template
<
typename
src_t
,
typename
dst_t
>
template
<
typename
src_t
,
typename
dst_t
>
static
__global__
void
convert_unary
(
const
void
*
__restrict__
vx
,
dst_t
*
__restrict__
y
,
const
int64_t
k
)
{
static
__global__
void
convert_unary
(
const
int64_t
i
=
(
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
const
void
*
__restrict__
vx
,
dst_t
*
__restrict__
y
,
const
int64_t
ne00
,
const
int64_t
ne01
,
const
int64_t
ne02
,
const
int64_t
s01
,
const
int64_t
s02
,
const
int64_t
s03
)
{
const
int64_t
i00
=
(
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
i
>=
k
)
{
if
(
i
00
>=
ne00
)
{
return
;
return
;
}
}
const
int64_t
i01
=
blockIdx
.
y
;
const
int64_t
i02
=
blockIdx
.
z
%
ne02
;
const
int64_t
i03
=
blockIdx
.
z
/
ne02
;
const
src_t
*
x
=
(
const
src_t
*
)
vx
;
const
src_t
*
x
=
(
const
src_t
*
)
vx
;
y
[
i
]
=
float
(
x
[
i
]);
const
int64_t
ix
=
i03
*
s03
+
i02
*
s02
+
i01
*
s01
+
i00
;
const
int64_t
iy
=
((
i03
*
ne02
+
i02
)
*
ne01
+
i01
)
*
ne00
+
i00
;
y
[
iy
]
=
float
(
x
[
ix
]);
}
}
template
<
typename
src_t
,
typename
dst_t
>
template
<
typename
src_t
,
typename
dst_t
>
static
void
convert_unary_cuda
(
const
void
*
__restrict__
vx
,
dst_t
*
__restrict__
y
,
const
int64_t
k
,
cudaStream_t
stream
)
{
static
void
convert_unary_cuda
(
const
void
*
vx
,
dst_t
*
y
,
const
int
num_blocks
=
(
k
+
CUDA_DEQUANTIZE_BLOCK_SIZE
-
1
)
/
CUDA_DEQUANTIZE_BLOCK_SIZE
;
const
int64_t
ne00
,
const
int64_t
ne01
,
const
int64_t
ne02
,
const
int64_t
ne03
,
convert_unary
<
src_t
><<<
num_blocks
,
CUDA_DEQUANTIZE_BLOCK_SIZE
,
0
,
stream
>>>
(
vx
,
y
,
k
);
const
int64_t
s01
,
const
int64_t
s02
,
const
int64_t
s03
,
cudaStream_t
stream
)
{
const
dim3
num_blocks
((
ne00
+
CUDA_DEQUANTIZE_BLOCK_SIZE
-
1
)
/
CUDA_DEQUANTIZE_BLOCK_SIZE
,
ne01
,
ne02
*
ne03
);
convert_unary
<
src_t
><<<
num_blocks
,
CUDA_DEQUANTIZE_BLOCK_SIZE
,
0
,
stream
>>>
(
vx
,
y
,
ne00
,
ne01
,
ne02
,
s01
,
s02
,
s03
);
}
// Convenience wrapper for converting a flat run of k elements.
// It forwards to convert_unary_cuda with ne00 = k, the three remaining extents
// set to 1 and all three strides (s01, s02, s03) set to k.
template <typename src_t, typename dst_t>
static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int64_t unit_extent = 1;  // ne01, ne02 and ne03
    const int64_t row_stride  = k;  // s01, s02 and s03

    convert_unary_cuda<src_t>(
        vx, y,
        k, unit_extent, unit_extent, unit_extent,
        row_stride, row_stride, row_stride,
        stream);
}
}
to_bf16_cuda_t
ggml_get_to_bf16_cuda
(
ggml_type
type
)
{
to_bf16_cuda_t
ggml_get_to_bf16_cuda
(
ggml_type
type
)
{
switch
(
type
)
{
switch
(
type
)
{
case
GGML_TYPE_F32
:
case
GGML_TYPE_F32
:
return
convert_unary_cuda
<
float
>
;
return
convert_unary_
cont_
cuda
<
float
>
;
case
GGML_TYPE_F16
:
case
GGML_TYPE_F16
:
return
convert_unary_cuda
<
half
>
;
return
convert_unary_
cont_
cuda
<
half
>
;
default:
default:
return
nullptr
;
return
nullptr
;
}
}
...
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
...
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
case
GGML_TYPE_IQ3_S
:
case
GGML_TYPE_IQ3_S
:
return
dequantize_row_iq3_s_cuda
;
return
dequantize_row_iq3_s_cuda
;
case
GGML_TYPE_F32
:
case
GGML_TYPE_F32
:
return
convert_unary_cuda
<
float
>
;
return
convert_unary_
cont_
cuda
<
float
>
;
case
GGML_TYPE_BF16
:
case
GGML_TYPE_BF16
:
return
convert_unary_cuda
<
nv_bfloat16
>
;
return
convert_unary_
cont_
cuda
<
nv_bfloat16
>
;
default:
default:
return
nullptr
;
return
nullptr
;
}
}
...
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
...
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
case
GGML_TYPE_IQ3_S
:
case
GGML_TYPE_IQ3_S
:
return
dequantize_row_iq3_s_cuda
;
return
dequantize_row_iq3_s_cuda
;
case
GGML_TYPE_F16
:
case
GGML_TYPE_F16
:
return
convert_unary_cuda
<
half
>
;
return
convert_unary_cont_cuda
<
half
>
;
case
GGML_TYPE_BF16
:
return
convert_unary_cont_cuda
<
nv_bfloat16
>
;
default:
return
nullptr
;
}
}
to_fp16_nc_cuda_t
ggml_get_to_fp16_nc_cuda
(
ggml_type
type
)
{
switch
(
type
)
{
case
GGML_TYPE_F32
:
return
convert_unary_cuda
<
float
>
;
case
GGML_TYPE_BF16
:
case
GGML_TYPE_BF16
:
return
convert_unary_cuda
<
nv_bfloat16
>
;
return
convert_unary_cuda
<
nv_bfloat16
>
;
default:
default:
...
...
ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh
View file @
8dd12c87
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
template
<
typename
T
>
template
<
typename
T
>
using
to_t_cuda_t
=
void
(
*
)(
const
void
*
__restrict__
x
,
T
*
__restrict__
y
,
int64_t
k
,
cudaStream_t
stream
);
using
to_t_cuda_t
=
void
(
*
)(
const
void
*
x
,
T
*
y
,
int64_t
k
,
cudaStream_t
stream
);
typedef
to_t_cuda_t
<
float
>
to_fp32_cuda_t
;
typedef
to_t_cuda_t
<
float
>
to_fp32_cuda_t
;
typedef
to_t_cuda_t
<
half
>
to_fp16_cuda_t
;
typedef
to_t_cuda_t
<
half
>
to_fp16_cuda_t
;
...
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
...
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
to_bf16_cuda_t
ggml_get_to_bf16_cuda
(
ggml_type
type
);
to_bf16_cuda_t
ggml_get_to_bf16_cuda
(
ggml_type
type
);
to_fp32_cuda_t
ggml_get_to_fp32_cuda
(
ggml_type
type
);
to_fp32_cuda_t
ggml_get_to_fp32_cuda
(
ggml_type
type
);
// TODO more general support for non-contiguous inputs
template
<
typename
T
>
using
to_t_nc_cuda_t
=
void
(
*
)(
const
void
*
x
,
T
*
y
,
int64_t
ne00
,
int64_t
ne01
,
int64_t
ne02
,
int64_t
ne03
,
int64_t
s01
,
int64_t
s02
,
int64_t
s03
,
cudaStream_t
stream
);
typedef
to_t_nc_cuda_t
<
half
>
to_fp16_nc_cuda_t
;
to_fp16_nc_cuda_t
ggml_get_to_fp16_nc_cuda
(
ggml_type
type
);
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
View file @
8dd12c87
...
@@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
...
@@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
if
(
ctx
.
cuda_graph
->
use_cpy_indirection
&&
!
disable_indirection_for_this_node
)
{
if
(
ctx
.
cuda_graph
->
use_cpy_indirection
&&
!
disable_indirection_for_this_node
)
{
ctx
.
cuda_graph
->
graph_cpynode_index
=
graph_cpynode_index
;
ctx
.
cuda_graph
->
graph_cpynode_index
=
graph_cpynode_index
;
}
}
#else
GGML_UNUSED
(
disable_indirection_for_this_node
);
#endif
#endif
}
}
...
...
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
View file @
8dd12c87
...
@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
...
@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
dfloat2
v
;
dfloat2
v
;
dequantize_kernel
(
src0_row
,
ib
,
iqs
,
v
);
dequantize_kernel
(
src0_row
,
ib
,
iqs
,
v
);
dst_row
[
iybs
+
iqs
+
0
]
=
v
.
x
;
dst_row
[
iybs
+
iqs
+
0
]
=
float
(
v
.
x
)
;
dst_row
[
iybs
+
iqs
+
y_offset
]
=
v
.
y
;
dst_row
[
iybs
+
iqs
+
y_offset
]
=
float
(
v
.
y
)
;
}
}
template
<
typename
src0_t
,
typename
dst_t
>
template
<
typename
src0_t
,
typename
dst_t
>
...
@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
...
@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
dst_t
*
dst_row
=
dst
+
i10
*
s1
+
i11
*
s2
+
i12
*
s3
;
dst_t
*
dst_row
=
dst
+
i10
*
s1
+
i11
*
s2
+
i12
*
s3
;
const
src0_t
*
src0_row
=
(
const
src0_t
*
)((
const
char
*
)
src0
+
i01
*
nb01
+
i11
*
nb02
+
i12
*
nb03
);
const
src0_t
*
src0_row
=
(
const
src0_t
*
)((
const
char
*
)
src0
+
i01
*
nb01
+
i11
*
nb02
+
i12
*
nb03
);
dst_row
[
i00
]
=
src0_row
[
i00
];
dst_row
[
i00
]
=
float
(
src0_row
[
i00
]
)
;
}
}
template
<
typename
grad_t
,
typename
dst_t
>
template
<
typename
grad_t
,
typename
dst_t
>
...
@@ -86,122 +86,161 @@ static __global__ void k_get_rows_back_float(
...
@@ -86,122 +86,161 @@ static __global__ void k_get_rows_back_float(
dst
[
dst_row
*
ncols
+
col
]
=
sum
;
dst
[
dst_row
*
ncols
+
col
]
=
sum
;
}
}
template
<
int
qk
,
int
qr
,
dequantize_kernel_t
dq
>
template
<
int
qk
,
int
qr
,
dequantize_kernel_t
dq
,
typename
dst_t
>
static
void
get_rows_cuda
(
static
void
get_rows_cuda
_q
(
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
,
const
void
*
src0
_d
,
const
int32_t
*
src1_d
,
dst_t
*
dst
_d
,
const
void
*
src0_dd
,
const
int32_t
*
src1_dd
,
float
*
dst_dd
,
cudaStream_t
stream
)
{
const
int64_t
ne00
,
const
size_t
nb01
,
const
size_t
nb02
,
const
size_t
nb03
,
const
int64_t
ne10
,
const
int64_t
ne11
,
const
int64_t
ne12
,
const
size_t
nb10
,
const
size_t
nb11
,
const
size_t
nb12
,
GGML_TENSOR_BINARY_OP_LOCALS
const
size_t
nb1
,
const
size_t
nb2
,
const
size_t
nb3
,
cudaStream_t
stream
)
{
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
int
block_num_x
=
(
ne00
+
2
*
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
(
2
*
CUDA_GET_ROWS_BLOCK_SIZE
);
const
int
block_num_x
=
(
ne00
+
2
*
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
(
2
*
CUDA_GET_ROWS_BLOCK_SIZE
);
const
dim3
block_nums
(
block_num_x
,
ne10
,
ne11
*
ne12
);
const
dim3
block_nums
(
block_num_x
,
ne10
,
ne11
*
ne12
);
// strides in elements
// strides in elements
//const size_t s0 = nb0 /
ggml_element_
size(dst);
//
const size_t s0 = nb0 / size
of
(dst
_t
);
const
size_t
s1
=
nb1
/
ggml_element_
size
(
dst
);
const
size_t
s1
=
nb1
/
size
of
(
dst
_t
);
const
size_t
s2
=
nb2
/
ggml_element_
size
(
dst
);
const
size_t
s2
=
nb2
/
size
of
(
dst
_t
);
const
size_t
s3
=
nb3
/
ggml_element_
size
(
dst
);
const
size_t
s3
=
nb3
/
size
of
(
dst
_t
);
const
size_t
s10
=
nb10
/
ggml_element_size
(
src1
);
const
size_t
s10
=
nb10
/
sizeof
(
int32_t
);
const
size_t
s11
=
nb11
/
ggml_element_size
(
src1
);
const
size_t
s11
=
nb11
/
sizeof
(
int32_t
);
const
size_t
s12
=
nb12
/
ggml_element_size
(
src1
);
const
size_t
s12
=
nb12
/
sizeof
(
int32_t
);
//const size_t s13 = nb13 /
ggml_element_size(src1
);
//
const size_t s13 = nb13 /
sizeof(int32_t
);
GGML_ASSERT
(
ne00
%
2
==
0
);
GGML_ASSERT
(
ne00
%
2
==
0
);
k_get_rows
<
qk
,
qr
,
dq
><<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
k_get_rows
<
qk
,
qr
,
dq
><<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
src0_d
d
,
src1_d
d
,
dst_d
d
,
src0_d
,
src1_d
,
dst_d
,
ne00
,
/*ne01, ne02, ne03,*/
ne00
,
/*ne01, ne02, ne03,*/
/*ne10, ne11,*/
ne12
,
/*ne13,*/
/*ne10, ne11,*/
ne12
,
/*ne13,*/
/* s0,*/
s1
,
s2
,
s3
,
/* s0,*/
s1
,
s2
,
s3
,
/* nb00,*/
nb01
,
nb02
,
nb03
,
/* nb00,*/
nb01
,
nb02
,
nb03
,
s10
,
s11
,
s12
/*, s13*/
);
s10
,
s11
,
s12
/*, s13*/
);
GGML_UNUSED
(
dst
);
}
}
template
<
typename
src0_t
>
template
<
typename
src0_t
,
typename
dst_t
>
static
void
get_rows_cuda_float
(
static
void
get_rows_cuda_float
(
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
,
const
src0_t
*
src0_d
,
const
int32_t
*
src1_d
,
dst_t
*
dst_d
,
const
src0_t
*
src0_dd
,
const
int32_t
*
src1_dd
,
float
*
dst_dd
,
cudaStream_t
stream
)
{
const
int64_t
ne00
,
const
size_t
nb01
,
const
size_t
nb02
,
const
size_t
nb03
,
const
int64_t
ne10
,
const
int64_t
ne11
,
const
int64_t
ne12
,
const
size_t
nb10
,
const
size_t
nb11
,
const
size_t
nb12
,
GGML_TENSOR_BINARY_OP_LOCALS
const
size_t
nb1
,
const
size_t
nb2
,
const
size_t
nb3
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ne13
==
1
);
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
int
block_num_x
=
(
ne00
+
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
CUDA_GET_ROWS_BLOCK_SIZE
;
const
int
block_num_x
=
(
ne00
+
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
CUDA_GET_ROWS_BLOCK_SIZE
;
const
dim3
block_nums
(
block_num_x
,
ne10
,
ne11
*
ne12
);
const
dim3
block_nums
(
block_num_x
,
ne10
,
ne11
*
ne12
);
// strides in elements
// strides in elements
//const size_t s0 = nb0 /
ggml_element_
size(dst);
//
const size_t s0 = nb0 / size
of
(dst
_t
);
const
size_t
s1
=
nb1
/
ggml_element_
size
(
dst
);
const
size_t
s1
=
nb1
/
size
of
(
dst
_t
);
const
size_t
s2
=
nb2
/
ggml_element_
size
(
dst
);
const
size_t
s2
=
nb2
/
size
of
(
dst
_t
);
const
size_t
s3
=
nb3
/
ggml_element_
size
(
dst
);
const
size_t
s3
=
nb3
/
size
of
(
dst
_t
);
const
size_t
s10
=
nb10
/
ggml_element_size
(
src1
);
const
size_t
s10
=
nb10
/
sizeof
(
int32_t
);
const
size_t
s11
=
nb11
/
ggml_element_size
(
src1
);
const
size_t
s11
=
nb11
/
sizeof
(
int32_t
);
const
size_t
s12
=
nb12
/
ggml_element_size
(
src1
);
const
size_t
s12
=
nb12
/
sizeof
(
int32_t
);
//const size_t s13 = nb13 /
ggml_element_size(src1
);
//
const size_t s13 = nb13 /
sizeof(int32_t
);
k_get_rows_float
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
k_get_rows_float
<<<
block_nums
,
block_dims
,
0
,
stream
>>>
(
src0_d
d
,
src1_d
d
,
dst_d
d
,
src0_d
,
src1_d
,
dst_d
,
ne00
,
/*ne01, ne02, ne03,*/
ne00
,
/*ne01, ne02, ne03,*/
/*ne10, ne11,*/
ne12
,
/*ne13,*/
/*ne10, ne11,*/
ne12
,
/*ne13,*/
/* s0,*/
s1
,
s2
,
s3
,
/* s0,*/
s1
,
s2
,
s3
,
/* nb00,*/
nb01
,
nb02
,
nb03
,
/* nb00,*/
nb01
,
nb02
,
nb03
,
s10
,
s11
,
s12
/*, s13*/
);
s10
,
s11
,
s12
/*, s13*/
);
GGML_UNUSED
(
dst
);
}
}
void
ggml_cuda_op_get_rows
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
)
{
template
<
typename
dst_t
>
const
ggml_tensor
*
src0
=
dst
->
src
[
0
];
static
void
ggml_cuda_get_rows_switch_src0_type
(
const
ggml_tensor
*
src1
=
dst
->
src
[
1
];
const
void
*
src0_d
,
const
ggml_type
src0_type
,
const
int32_t
*
src1_d
,
dst_t
*
dst_d
,
const
int64_t
ne00
,
const
size_t
nb01
,
const
size_t
nb02
,
const
size_t
nb03
,
const
void
*
src0_d
=
(
const
void
*
)
src0
->
data
;
const
int64_t
ne10
,
const
int64_t
ne11
,
const
int64_t
ne12
,
const
size_t
nb10
,
const
size_t
nb11
,
const
size_t
nb12
,
const
int32_t
*
src1_d
=
(
const
int32_t
*
)
src1
->
data
;
const
size_t
nb1
,
const
size_t
nb2
,
const
size_t
nb3
,
float
*
dst_d
=
(
float
*
)
dst
->
data
;
cudaStream_t
stream
)
{
switch
(
src0_type
)
{
cudaStream_t
stream
=
ctx
.
stream
();
GGML_ASSERT
(
src1
->
type
==
GGML_TYPE_I32
);
GGML_ASSERT
(
dst
->
type
==
GGML_TYPE_F32
);
GGML_ASSERT
(
src0
->
nb
[
0
]
==
ggml_type_size
(
src0
->
type
));
GGML_ASSERT
(
src1
->
nb
[
0
]
==
ggml_type_size
(
src1
->
type
));
GGML_ASSERT
(
dst
->
nb
[
0
]
==
ggml_type_size
(
dst
->
type
));
switch
(
src0
->
type
)
{
case
GGML_TYPE_F16
:
case
GGML_TYPE_F16
:
get_rows_cuda_float
(
src0
,
src1
,
dst
,
(
const
half
*
)
src0_d
,
src1_d
,
dst_d
,
stream
);
get_rows_cuda_float
((
const
half
*
)
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
break
;
case
GGML_TYPE_F32
:
case
GGML_TYPE_F32
:
get_rows_cuda_float
(
src0
,
src1
,
dst
,
(
const
float
*
)
src0_d
,
src1_d
,
dst_d
,
stream
);
get_rows_cuda_float
((
const
float
*
)
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
case
GGML_TYPE_BF16
:
get_rows_cuda_float
((
const
nv_bfloat16
*
)
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
break
;
case
GGML_TYPE_Q4_0
:
case
GGML_TYPE_Q4_0
:
get_rows_cuda
<
QK4_0
,
QR4_0
,
dequantize_q4_0
>
(
src0
,
src1
,
dst
,
src0_d
,
src1_d
,
dst_d
,
stream
);
get_rows_cuda_q
<
QK4_0
,
QR4_0
,
dequantize_q4_0
>
(
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
break
;
case
GGML_TYPE_Q4_1
:
case
GGML_TYPE_Q4_1
:
get_rows_cuda
<
QK4_1
,
QR4_1
,
dequantize_q4_1
>
(
src0
,
src1
,
dst
,
src0_d
,
src1_d
,
dst_d
,
stream
);
get_rows_cuda_q
<
QK4_1
,
QR4_1
,
dequantize_q4_1
>
(
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
break
;
case
GGML_TYPE_Q5_0
:
case
GGML_TYPE_Q5_0
:
get_rows_cuda
<
QK5_0
,
QR5_0
,
dequantize_q5_0
>
(
src0
,
src1
,
dst
,
src0_d
,
src1_d
,
dst_d
,
stream
);
get_rows_cuda_q
<
QK5_0
,
QR5_0
,
dequantize_q5_0
>
(
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
break
;
case
GGML_TYPE_Q5_1
:
case
GGML_TYPE_Q5_1
:
get_rows_cuda
<
QK5_1
,
QR5_1
,
dequantize_q5_1
>
(
src0
,
src1
,
dst
,
src0_d
,
src1_d
,
dst_d
,
stream
);
get_rows_cuda_q
<
QK5_1
,
QR5_1
,
dequantize_q5_1
>
(
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
break
;
case
GGML_TYPE_Q8_0
:
case
GGML_TYPE_Q8_0
:
get_rows_cuda
<
QK8_0
,
QR8_0
,
dequantize_q8_0
>
(
src0
,
src1
,
dst
,
src0_d
,
src1_d
,
dst_d
,
stream
);
get_rows_cuda_q
<
QK8_0
,
QR8_0
,
dequantize_q8_0
>
(
src0_d
,
src1_d
,
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
break
;
default:
default:
// TODO: k-quants
// TODO: k-quants
GGML_ABORT
(
"%s: unsupported type: %s
\n
"
,
__func__
,
ggml_type_name
(
src0
->
type
));
GGML_ABORT
(
"%s: unsupported
src0
type: %s
\n
"
,
__func__
,
ggml_type_name
(
src0
_
type
));
break
;
break
;
}
}
}
}
void
get_rows_cuda
(
const
void
*
src0_d
,
ggml_type
src0_type
,
const
int32_t
*
src1_d
,
void
*
dst_d
,
ggml_type
dst_type
,
int64_t
ne00
,
size_t
nb01
,
size_t
nb02
,
size_t
nb03
,
int64_t
ne10
,
int64_t
ne11
,
int64_t
ne12
,
size_t
nb10
,
size_t
nb11
,
size_t
nb12
,
size_t
nb1
,
size_t
nb2
,
size_t
nb3
,
cudaStream_t
stream
)
{
switch
(
dst_type
)
{
case
GGML_TYPE_F32
:
ggml_cuda_get_rows_switch_src0_type
(
src0_d
,
src0_type
,
src1_d
,
(
float
*
)
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
case
GGML_TYPE_F16
:
ggml_cuda_get_rows_switch_src0_type
(
src0_d
,
src0_type
,
src1_d
,
(
half
*
)
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
case
GGML_TYPE_BF16
:
ggml_cuda_get_rows_switch_src0_type
(
src0_d
,
src0_type
,
src1_d
,
(
nv_bfloat16
*
)
dst_d
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
break
;
default:
GGML_ABORT
(
"%s: unsupported dst type: %s
\n
"
,
__func__
,
ggml_type_name
(
dst_type
));
break
;
}
}
void
ggml_cuda_op_get_rows
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
)
{
const
ggml_tensor
*
src0
=
dst
->
src
[
0
];
const
ggml_tensor
*
src1
=
dst
->
src
[
1
];
cudaStream_t
stream
=
ctx
.
stream
();
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT
(
src1
->
type
==
GGML_TYPE_I32
);
GGML_ASSERT
(
ne13
==
1
);
GGML_ASSERT
(
src0
->
nb
[
0
]
==
ggml_type_size
(
src0
->
type
));
GGML_ASSERT
(
src1
->
nb
[
0
]
==
ggml_type_size
(
src1
->
type
));
GGML_ASSERT
(
dst
->
nb
[
0
]
==
ggml_type_size
(
dst
->
type
));
get_rows_cuda
(
src0
->
data
,
src0
->
type
,
(
const
int32_t
*
)
src1
->
data
,
dst
->
data
,
dst
->
type
,
ne00
,
nb01
,
nb02
,
nb03
,
ne10
,
ne11
,
ne12
,
nb10
,
nb11
,
nb12
,
nb1
,
nb2
,
nb3
,
stream
);
}
void
ggml_cuda_op_get_rows_back
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
)
{
void
ggml_cuda_op_get_rows_back
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
)
{
const
ggml_tensor
*
src0
=
dst
->
src
[
0
];
// gradients of forward pass output
const
ggml_tensor
*
src0
=
dst
->
src
[
0
];
// gradients of forward pass output
const
ggml_tensor
*
src1
=
dst
->
src
[
1
];
// src1 in forward pass
const
ggml_tensor
*
src1
=
dst
->
src
[
1
];
// src1 in forward pass
...
...
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh
View file @
8dd12c87
...
@@ -3,6 +3,13 @@
...
@@ -3,6 +3,13 @@
#define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
void
get_rows_cuda
(
const
void
*
src0_d
,
ggml_type
src0_type
,
const
int32_t
*
src1_d
,
void
*
dst_d
,
ggml_type
dst_type
,
int64_t
ne00
,
size_t
nb01
,
size_t
nb02
,
size_t
nb03
,
int64_t
ne10
,
int64_t
ne11
,
int64_t
ne12
,
size_t
nb10
,
size_t
nb11
,
size_t
nb12
,
size_t
nb1
,
size_t
nb2
,
size_t
nb3
,
cudaStream_t
stream
);
void
ggml_cuda_op_get_rows
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
);
void
ggml_cuda_op_get_rows
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
);
void
ggml_cuda_op_get_rows_back
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
);
void
ggml_cuda_op_get_rows_back
(
ggml_backend_cuda_context
&
ctx
,
ggml_tensor
*
dst
);
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
View file @
8dd12c87
This diff is collapsed.
Click to expand it.
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu
View file @
8dd12c87
This diff is collapsed.
Click to expand it.
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh
View file @
8dd12c87
This diff is collapsed.
Click to expand it.
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu
View file @
8dd12c87
This diff is collapsed.
Click to expand it.
ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh
View file @
8dd12c87
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512
#define MMV_MAX_ROWS 512
void
ggml_cuda_mul_mat_vec
(
ggml_backend_cuda_context
&
ctx
,
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
);
void
ggml_cuda_mul_mat_vec
(
ggml_backend_cuda_context
&
ctx
,
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
const
ggml_tensor
*
ids
,
ggml_tensor
*
dst
);
void
ggml_cuda_op_mul_mat_vec
(
void
ggml_cuda_op_mul_mat_vec
(
ggml_backend_cuda_context
&
ctx
,
ggml_backend_cuda_context
&
ctx
,
...
...
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu
View file @
8dd12c87
This diff is collapsed.
Click to expand it.
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment