Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
0cefd46f
Unverified
Commit
0cefd46f
authored
May 12, 2025
by
Jeffrey Morgan
Committed by
GitHub
May 12, 2025
Browse files
llama: update to commit de4c07f93 (#10655)
parent
ad035ad5
Changes
113
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
179 additions
and
116 deletions
+179
-116
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
+14
-12
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+28
-13
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu
+14
-3
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh
+35
-27
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu
+11
-0
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu
+1
-0
ml/backend/ggml/ggml/src/ggml-cuda/sum.cu
ml/backend/ggml/ggml/src/ggml-cuda/sum.cu
+1
-1
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
...te-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
+5
-0
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
...ate-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu
...te-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu
...te-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
...te-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
...te-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
+5
-0
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
...ate-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
...ate-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu
...te-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu
...te-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
...te-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
+5
-0
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu
...ate-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu
+6
-6
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
...ate-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
+6
-6
No files found.
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
View file @
0cefd46f
...
@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
...
@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
/*const size_t nb00,*/
const
size_t
nb01
,
const
size_t
nb02
,
const
size_t
nb03
,
/*const size_t nb00,*/
const
size_t
nb01
,
const
size_t
nb02
,
const
size_t
nb03
,
const
size_t
s10
,
const
size_t
s11
,
const
size_t
s12
/*, const size_t s13*/
)
{
const
size_t
s10
,
const
size_t
s11
,
const
size_t
s12
/*, const size_t s13*/
)
{
const
int
i00
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
*
2
;
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const
int
i10
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
const
int
i00
=
(
blockIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
)
*
2
;
const
int
i11
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
/
ne12
;
const
int
i10
=
blockIdx
.
x
;
const
int
i12
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
%
ne12
;
const
int
i11
=
blockIdx
.
z
/
ne12
;
const
int
i12
=
blockIdx
.
z
%
ne12
;
if
(
i00
>=
ne00
)
{
if
(
i00
>=
ne00
)
{
return
;
return
;
...
@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
...
@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
/*const size_t nb00,*/
const
size_t
nb01
,
const
size_t
nb02
,
const
size_t
nb03
,
/*const size_t nb00,*/
const
size_t
nb01
,
const
size_t
nb02
,
const
size_t
nb03
,
const
size_t
s10
,
const
size_t
s11
,
const
size_t
s12
/*, const size_t s13*/
)
{
const
size_t
s10
,
const
size_t
s11
,
const
size_t
s12
/*, const size_t s13*/
)
{
const
int
i00
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const
int
i10
=
blockDim
.
y
*
blockIdx
.
y
+
threadIdx
.
y
;
const
int
i00
=
blockIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
i11
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
/
ne12
;
const
int
i10
=
blockIdx
.
x
;
const
int
i12
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
%
ne12
;
const
int
i11
=
blockIdx
.
z
/
ne12
;
const
int
i12
=
blockIdx
.
z
%
ne12
;
if
(
i00
>=
ne00
)
{
if
(
i00
>=
ne00
)
{
return
;
return
;
...
@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
...
@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
const
size_t
nb1
,
const
size_t
nb2
,
const
size_t
nb3
,
const
size_t
nb1
,
const
size_t
nb2
,
const
size_t
nb3
,
cudaStream_t
stream
)
{
cudaStream_t
stream
)
{
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
int
block_num_
x
=
(
ne00
+
2
*
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
(
2
*
CUDA_GET_ROWS_BLOCK_SIZE
);
const
int
block_num_
y
=
(
ne00
+
2
*
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
(
2
*
CUDA_GET_ROWS_BLOCK_SIZE
);
const
dim3
block_nums
(
block_num_
x
,
ne10
,
ne11
*
ne12
);
const
dim3
block_nums
(
ne10
,
block_num_
y
,
ne11
*
ne12
);
// strides in elements
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
// const size_t s0 = nb0 / sizeof(dst_t);
...
@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
...
@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
const
size_t
nb1
,
const
size_t
nb2
,
const
size_t
nb3
,
const
size_t
nb1
,
const
size_t
nb2
,
const
size_t
nb3
,
cudaStream_t
stream
)
{
cudaStream_t
stream
)
{
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
dim3
block_dims
(
CUDA_GET_ROWS_BLOCK_SIZE
,
1
,
1
);
const
int
block_num_
x
=
(
ne00
+
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
CUDA_GET_ROWS_BLOCK_SIZE
;
const
int
block_num_
y
=
(
ne00
+
CUDA_GET_ROWS_BLOCK_SIZE
-
1
)
/
CUDA_GET_ROWS_BLOCK_SIZE
;
const
dim3
block_nums
(
block_num_
x
,
ne10
,
ne11
*
ne12
);
const
dim3
block_nums
(
ne10
,
block_num_
y
,
ne11
*
ne12
);
// strides in elements
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
// const size_t s0 = nb0 / sizeof(dst_t);
...
...
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
View file @
0cefd46f
...
@@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
...
@@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
if
(
ggml_is_quantized
(
tensor
->
type
)
&&
tensor
->
view_src
==
nullptr
&&
ggml_backend_buffer_get_usage
(
buffer
)
!=
GGML_BACKEND_BUFFER_USAGE_COMPUTE
)
{
if
(
ggml_is_quantized
(
tensor
->
type
)
&&
tensor
->
view_src
==
nullptr
&&
ggml_backend_buffer_get_usage
(
buffer
)
!=
GGML_BACKEND_BUFFER_USAGE_COMPUTE
)
{
// initialize padding to 0 to avoid possible NaN values
// initialize padding to 0 to avoid possible NaN values
size_t
original_size
=
ggml_nbytes
(
tensor
);
const
size_t
original_size
=
ggml_nbytes
(
tensor
);
size_t
padded_size
=
ggml_backend_buft_get_alloc_size
(
buffer
->
buft
,
tensor
);
const
size_t
padded_size
=
ggml_backend_buft_get_alloc_size
(
buffer
->
buft
,
tensor
);
if
(
padded_size
>
original_size
)
{
if
(
padded_size
>
original_size
)
{
ggml_cuda_set_device
(
ctx
->
device
);
ggml_cuda_set_device
(
ctx
->
device
);
...
@@ -680,6 +680,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
...
@@ -680,6 +680,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
if
(
ggml_is_quantized
(
tensor
->
type
))
{
if
(
ggml_is_quantized
(
tensor
->
type
))
{
if
(
ne0
%
MATRIX_ROW_PADDING
!=
0
)
{
if
(
ne0
%
MATRIX_ROW_PADDING
!=
0
)
{
GGML_ASSERT
(
tensor
->
nb
[
0
]
==
ggml_element_size
(
tensor
));
size
+=
ggml_row_size
(
tensor
->
type
,
MATRIX_ROW_PADDING
-
ne0
%
MATRIX_ROW_PADDING
);
size
+=
ggml_row_size
(
tensor
->
type
,
MATRIX_ROW_PADDING
-
ne0
%
MATRIX_ROW_PADDING
);
}
}
}
}
...
@@ -802,6 +803,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
...
@@ -802,6 +803,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
static
enum
ggml_status
ggml_backend_cuda_split_buffer_init_tensor
(
ggml_backend_buffer_t
buffer
,
ggml_tensor
*
tensor
)
{
static
enum
ggml_status
ggml_backend_cuda_split_buffer_init_tensor
(
ggml_backend_buffer_t
buffer
,
ggml_tensor
*
tensor
)
{
GGML_ASSERT
(
tensor
->
view_src
==
nullptr
);
// views of split tensors are not supported
GGML_ASSERT
(
tensor
->
view_src
==
nullptr
);
// views of split tensors are not supported
GGML_ASSERT
(
ggml_is_contiguous
(
tensor
)
&&
"split buffers only supported for contiguous tensors"
);
ggml_backend_cuda_split_buffer_context
*
ctx
=
(
ggml_backend_cuda_split_buffer_context
*
)
buffer
->
context
;
ggml_backend_cuda_split_buffer_context
*
ctx
=
(
ggml_backend_cuda_split_buffer_context
*
)
buffer
->
context
;
ggml_backend_cuda_split_buffer_type_context
*
buft_ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buffer
->
buft
->
context
;
ggml_backend_cuda_split_buffer_type_context
*
buft_ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buffer
->
buft
->
context
;
...
@@ -853,6 +855,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
...
@@ -853,6 +855,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
// split tensors must always be set in their entirety at once
// split tensors must always be set in their entirety at once
GGML_ASSERT
(
offset
==
0
);
GGML_ASSERT
(
offset
==
0
);
GGML_ASSERT
(
size
==
ggml_nbytes
(
tensor
));
GGML_ASSERT
(
size
==
ggml_nbytes
(
tensor
));
GGML_ASSERT
(
ggml_is_contiguous
(
tensor
)
&&
"split buffers only supported for contiguous tensors"
);
ggml_backend_cuda_split_buffer_type_context
*
buft_ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buffer
->
buft
->
context
;
ggml_backend_cuda_split_buffer_type_context
*
buft_ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buffer
->
buft
->
context
;
...
@@ -891,6 +894,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
...
@@ -891,6 +894,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
// split tensors must always be set in their entirety at once
// split tensors must always be set in their entirety at once
GGML_ASSERT
(
offset
==
0
);
GGML_ASSERT
(
offset
==
0
);
GGML_ASSERT
(
size
==
ggml_nbytes
(
tensor
));
GGML_ASSERT
(
size
==
ggml_nbytes
(
tensor
));
GGML_ASSERT
(
ggml_is_contiguous
(
tensor
)
&&
"split buffers only supported for contiguous tensors"
);
ggml_backend_cuda_split_buffer_type_context
*
buft_ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buffer
->
buft
->
context
;
ggml_backend_cuda_split_buffer_type_context
*
buft_ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buffer
->
buft
->
context
;
...
@@ -972,6 +976,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
...
@@ -972,6 +976,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
static
size_t
ggml_backend_cuda_split_buffer_type_get_alloc_size
(
ggml_backend_buffer_type_t
buft
,
const
ggml_tensor
*
tensor
)
{
static
size_t
ggml_backend_cuda_split_buffer_type_get_alloc_size
(
ggml_backend_buffer_type_t
buft
,
const
ggml_tensor
*
tensor
)
{
ggml_backend_cuda_split_buffer_type_context
*
ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buft
->
context
;
ggml_backend_cuda_split_buffer_type_context
*
ctx
=
(
ggml_backend_cuda_split_buffer_type_context
*
)
buft
->
context
;
GGML_ASSERT
(
ggml_is_contiguous
(
tensor
)
&&
"split buffers only supported for contiguous tensors"
);
size_t
total_size
=
0
;
size_t
total_size
=
0
;
...
@@ -1534,6 +1539,8 @@ static void ggml_cuda_op_mul_mat(
...
@@ -1534,6 +1539,8 @@ static void ggml_cuda_op_mul_mat(
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
if
(
ne00
%
MATRIX_ROW_PADDING
!=
0
&&
ggml_is_quantized
(
src0
->
type
)
&&
ggml_backend_buffer_get_usage
(
src0
->
buffer
)
==
GGML_BACKEND_BUFFER_USAGE_COMPUTE
&&
src0
->
view_src
==
nullptr
)
{
if
(
ne00
%
MATRIX_ROW_PADDING
!=
0
&&
ggml_is_quantized
(
src0
->
type
)
&&
ggml_backend_buffer_get_usage
(
src0
->
buffer
)
==
GGML_BACKEND_BUFFER_USAGE_COMPUTE
&&
src0
->
view_src
==
nullptr
)
{
GGML_ASSERT
(
ggml_is_contiguously_allocated
(
src0
));
GGML_ASSERT
(
!
src0
->
view_src
);
const
size_t
nbytes_data
=
ggml_row_size
(
src0
->
type
,
(
dev
[
id
].
row_high
-
dev
[
id
].
row_low
)
*
ne00
);
const
size_t
nbytes_data
=
ggml_row_size
(
src0
->
type
,
(
dev
[
id
].
row_high
-
dev
[
id
].
row_low
)
*
ne00
);
const
size_t
nbytes_padding
=
ggml_row_size
(
src0
->
type
,
MATRIX_ROW_PADDING
-
ne00
%
MATRIX_ROW_PADDING
);
const
size_t
nbytes_padding
=
ggml_row_size
(
src0
->
type
,
MATRIX_ROW_PADDING
-
ne00
%
MATRIX_ROW_PADDING
);
CUDA_CHECK
(
cudaMemsetAsync
(
dev
[
id
].
src0_dd
+
nbytes_data
,
0
,
nbytes_padding
,
stream
));
CUDA_CHECK
(
cudaMemsetAsync
(
dev
[
id
].
src0_dd
+
nbytes_data
,
0
,
nbytes_padding
,
stream
));
...
@@ -1905,13 +1912,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
...
@@ -1905,13 +1912,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static
void
ggml_cuda_mul_mat
(
ggml_backend_cuda_context
&
ctx
,
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
)
{
static
void
ggml_cuda_mul_mat
(
ggml_backend_cuda_context
&
ctx
,
const
ggml_tensor
*
src0
,
const
ggml_tensor
*
src1
,
ggml_tensor
*
dst
)
{
const
bool
split
=
ggml_backend_buft_is_cuda_split
(
src0
->
buffer
->
buft
);
const
bool
split
=
ggml_backend_buft_is_cuda_split
(
src0
->
buffer
->
buft
);
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
// Therefore, in such cases use cuBLAS.
const
bool
bad_padding_clear
=
ggml_backend_buffer_get_usage
(
src0
->
buffer
)
==
GGML_BACKEND_BUFFER_USAGE_COMPUTE
&&
ggml_nbytes
(
src0
)
!=
ggml_backend_buffer_get_alloc_size
(
src0
->
buffer
,
src0
)
&&
src0
->
view_src
;
bool
use_mul_mat_vec
=
(
src0
->
type
==
GGML_TYPE_F32
||
src0
->
type
==
GGML_TYPE_F16
||
src0
->
type
==
GGML_TYPE_BF16
)
bool
use_mul_mat_vec
=
(
src0
->
type
==
GGML_TYPE_F32
||
src0
->
type
==
GGML_TYPE_F16
||
src0
->
type
==
GGML_TYPE_BF16
)
&&
src1
->
type
==
GGML_TYPE_F32
&&
dst
->
type
==
GGML_TYPE_F32
&&
src1
->
type
==
GGML_TYPE_F32
&&
dst
->
type
==
GGML_TYPE_F32
&&
src0
->
ne
[
0
]
%
2
==
0
&&
src1
->
ne
[
1
]
==
1
;
&&
src0
->
ne
[
0
]
%
2
==
0
&&
src1
->
ne
[
1
]
==
1
;
bool
use_mul_mat_vec_q
=
ggml_is_quantized
(
src0
->
type
)
bool
use_mul_mat_vec_q
=
ggml_is_quantized
(
src0
->
type
)
&&
!
bad_padding_clear
&&
src1
->
type
==
GGML_TYPE_F32
&&
dst
->
type
==
GGML_TYPE_F32
&&
src1
->
type
==
GGML_TYPE_F32
&&
dst
->
type
==
GGML_TYPE_F32
&&
src1
->
ne
[
1
]
<=
MMVQ_MAX_BATCH_SIZE
;
&&
src1
->
ne
[
1
]
<=
MMVQ_MAX_BATCH_SIZE
;
bool
use_mul_mat_q
=
ggml_is_quantized
(
src0
->
type
)
bool
use_mul_mat_q
=
ggml_is_quantized
(
src0
->
type
)
&&
!
bad_padding_clear
&&
src1
->
type
==
GGML_TYPE_F32
&&
dst
->
type
==
GGML_TYPE_F32
;
&&
src1
->
type
==
GGML_TYPE_F32
&&
dst
->
type
==
GGML_TYPE_F32
;
bool
any_gpus_with_slow_fp16
=
false
;
bool
any_gpus_with_slow_fp16
=
false
;
...
@@ -2065,9 +2078,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
...
@@ -2065,9 +2078,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
}
ggml_tensor
src0_slice
=
*
src0
;
ggml_tensor
src0_slice
=
*
src0
;
src0_slice
.
ne
[
2
]
=
1
;
src0_slice
.
ne
[
2
]
=
1
;
src0_slice
.
nb
[
3
]
=
src0_slice
.
nb
[
2
];
src0_slice
.
nb
[
3
]
=
src0_slice
.
nb
[
2
];
src0_slice
.
data
=
(
char
*
)
src0
->
data
+
i02
*
nb02
;
src0_slice
.
op
=
GGML_OP_VIEW
;
src0_slice
.
view_src
=
dst
->
src
[
0
];
// non-const pointer to src0
src0_slice
.
data
=
(
char
*
)
src0
->
data
+
i02
*
nb02
;
ggml_tensor
src1_slice
;
ggml_tensor
src1_slice
;
memset
(
&
src1_slice
,
0
,
sizeof
(
src1_slice
));
memset
(
&
src1_slice
,
0
,
sizeof
(
src1_slice
));
...
@@ -3213,16 +3228,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
...
@@ -3213,16 +3228,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return
false
;
return
false
;
#endif // FLASH_ATTN_AVAILABLE
#endif // FLASH_ATTN_AVAILABLE
if
(
op
->
src
[
1
]
->
ne
[
0
]
!=
op
->
src
[
2
]
->
ne
[
0
])
{
if
(
op
->
src
[
1
]
->
ne
[
0
]
!=
op
->
src
[
2
]
->
ne
[
0
])
{
// different head sizes of K and V are not supported yet
const
int
cc
=
ggml_cuda_info
().
devices
[
dev_ctx
->
device
].
cc
;
return
false
;
if
(
!
new_mma_available
(
cc
)
||
cc
<
GGML_CUDA_CC_AMPERE
)
{
return
false
;
}
const
int
gqa_ratio
=
op
->
src
[
0
]
->
ne
[
2
]
/
op
->
src
[
1
]
->
ne
[
2
];
return
op
->
src
[
1
]
->
ne
[
0
]
==
576
&&
op
->
src
[
2
]
->
ne
[
0
]
==
512
&&
op
->
src
[
3
]
&&
gqa_ratio
%
16
==
0
;
}
}
if
(
op
->
src
[
0
]
->
ne
[
0
]
==
192
)
{
if
(
op
->
src
[
0
]
->
ne
[
0
]
==
192
)
{
return
false
;
return
false
;
}
}
if
(
op
->
src
[
0
]
->
ne
[
0
]
==
576
)
{
// DeepSeek MLA
return
false
;
}
if
(
op
->
src
[
0
]
->
ne
[
3
]
!=
1
)
{
if
(
op
->
src
[
0
]
->
ne
[
3
]
!=
1
)
{
return
false
;
return
false
;
}
}
...
...
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu
View file @
0cefd46f
...
@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
...
@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
const
float
*
src1_d
=
(
const
float
*
)
src1
->
data
;
const
float
*
src1_d
=
(
const
float
*
)
src1
->
data
;
float
*
dst_d
=
(
float
*
)
dst
->
data
;
float
*
dst_d
=
(
float
*
)
dst
->
data
;
// If src0 is a temporary compute buffer, clear any potential padding.
if
(
ggml_backend_buffer_get_usage
(
src0
->
buffer
)
==
GGML_BACKEND_BUFFER_USAGE_COMPUTE
)
{
const
size_t
size_data
=
ggml_nbytes
(
src0
);
const
size_t
size_alloc
=
ggml_backend_buffer_get_alloc_size
(
src0
->
buffer
,
src0
);
if
(
size_alloc
>
size_data
)
{
GGML_ASSERT
(
ggml_is_contiguously_allocated
(
src0
));
GGML_ASSERT
(
!
src0
->
view_src
);
CUDA_CHECK
(
cudaMemsetAsync
((
char
*
)
src0
->
data
+
size_data
,
0
,
size_alloc
-
size_data
,
stream
));
}
}
const
int64_t
ne10_padded
=
GGML_PAD
(
ne10
,
MATRIX_ROW_PADDING
);
const
int64_t
ne10_padded
=
GGML_PAD
(
ne10
,
MATRIX_ROW_PADDING
);
const
int64_t
s01
=
src0
->
nb
[
1
]
/
ts_src0
;
const
int64_t
s01
=
src0
->
nb
[
1
]
/
ts_src0
;
...
@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
...
@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
const
mmq_args
args
=
{
const
mmq_args
args
=
{
src0_d
,
src0
->
type
,
(
const
int
*
)
src1_q8_1
.
ptr
,
nullptr
,
nullptr
,
dst_d
,
src0_d
,
src0
->
type
,
(
const
int
*
)
src1_q8_1
.
ptr
,
nullptr
,
nullptr
,
dst_d
,
ne00
,
ne01
,
ne1
,
s01
,
s1
,
ne00
,
ne01
,
ne1
,
s01
,
ne11
,
s1
,
ne02
,
ne12
,
s02
,
s12
,
s2
,
ne02
,
ne12
,
s02
,
s12
,
s2
,
ne03
,
ne13
,
s03
,
s13
,
s3
,
ne03
,
ne13
,
s03
,
s13
,
s3
,
use_stream_k
};
use_stream_k
};
...
@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
...
@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
const
mmq_args
args
=
{
const
mmq_args
args
=
{
src0_d
,
src0
->
type
,
(
const
int
*
)
src1_q8_1
.
ptr
,
ids_dst_dev
,
expert_bounds_dev
,
dst_d
,
src0_d
,
src0
->
type
,
(
const
int
*
)
src1_q8_1
.
ptr
,
ids_dst_dev
,
expert_bounds_dev
,
dst_d
,
ne00
,
ne01
,
ne_get_rows
,
s01
,
s1
,
ne00
,
ne01
,
ne_get_rows
,
s01
,
ne_get_rows
,
s1
,
ne02
,
ne02
,
s02
,
s12
,
s2
,
ne02
,
ne02
,
s02
,
s12
,
s2
,
ne03
,
ne13
,
s03
,
s13
,
s3
,
ne03
,
ne13
,
s03
,
s13
,
s3
,
use_stream_k
};
use_stream_k
};
...
@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
...
@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
ggml_cuda_highest_compiled_arch
(
cc
)
>=
GGML_CUDA_CC_VOLTA
&&
src1_ncols
==
ne11
;
ggml_cuda_highest_compiled_arch
(
cc
)
>=
GGML_CUDA_CC_VOLTA
&&
src1_ncols
==
ne11
;
const
mmq_args
args
=
{
const
mmq_args
args
=
{
src0_dd_i
,
src0
->
type
,
(
const
int
*
)
src1_ddq_i
,
nullptr
,
nullptr
,
dst_dd_i
,
src0_dd_i
,
src0
->
type
,
(
const
int
*
)
src1_ddq_i
,
nullptr
,
nullptr
,
dst_dd_i
,
ne00
,
row_diff
,
src1_ncols
,
stride01
,
nrows_dst
,
ne00
,
row_diff
,
src1_ncols
,
stride01
,
ne11
,
nrows_dst
,
1
,
1
,
0
,
0
,
0
,
1
,
1
,
0
,
0
,
0
,
1
,
1
,
0
,
0
,
0
,
1
,
1
,
0
,
0
,
0
,
use_stream_k
};
use_stream_k
};
...
...
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh
View file @
0cefd46f
...
@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
...
@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
static
__device__
__forceinline__
void
mul_mat_q_process_tile
(
static
__device__
__forceinline__
void
mul_mat_q_process_tile
(
const
char
*
__restrict__
x
,
const
int
offset_x
,
const
int
*
__restrict__
y
,
const
char
*
__restrict__
x
,
const
int
offset_x
,
const
int
*
__restrict__
y
,
const
int
*
__restrict__
ids_dst
,
float
*
__restrict__
dst
,
float
*
__restrict__
tmp_fixup
,
const
int
*
__restrict__
ids_dst
,
float
*
__restrict__
dst
,
float
*
__restrict__
tmp_fixup
,
const
int
n
row
s
_x
,
const
int
ncols_y
,
const
int
stride_row_x
,
const
int
stride_col_dst
,
const
int
stride_
row_x
,
const
int
ncols_y
,
const
int
stride_col_dst
,
const
int
tile_x_max_i
,
const
int
tile_y_max_j
,
const
int
kb0_start
,
const
int
kb0_stop
)
{
const
int
tile_x_max_i
,
const
int
tile_y_max_j
,
const
int
kb0_start
,
const
int
kb0_stop
)
{
constexpr
int
qk
=
ggml_cuda_type_traits
<
type
>::
qk
;
constexpr
int
qk
=
ggml_cuda_type_traits
<
type
>::
qk
;
...
@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
...
@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
static
__global__
void
mul_mat_q
(
static
__global__
void
mul_mat_q
(
const
char
*
__restrict__
x
,
const
int
*
__restrict__
y
,
const
int32_t
*
__restrict__
ids_dst
,
const
char
*
__restrict__
x
,
const
int
*
__restrict__
y
,
const
int32_t
*
__restrict__
ids_dst
,
const
int32_t
*
__restrict__
expert_bounds
,
float
*
__restrict__
dst
,
float
*
__restrict__
tmp_fixup
,
const
int32_t
*
__restrict__
expert_bounds
,
float
*
__restrict__
dst
,
float
*
__restrict__
tmp_fixup
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_
y
,
const
int
stride_row_x
,
const
int
stride_col_dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_
dst
,
const
int
stride_row_x
,
const
int
ncols_y
,
const
int
stride_col_dst
,
const
int
channel_ratio
,
const
int
nchannels_y
,
const
int
stride_channel_x
,
const
int
stride_channel_y
,
const
int
stride_channel_dst
,
const
int
channel_ratio
,
const
int
nchannels_y
,
const
int
stride_channel_x
,
const
int
stride_channel_y
,
const
int
stride_channel_dst
,
const
int
sample_ratio
,
const
int
nsamples_y
,
const
int
stride_sample_x
,
const
int
stride_sample_y
,
const
int
stride_sample_dst
)
{
const
int
sample_ratio
,
const
int
nsamples_y
,
const
int
stride_sample_x
,
const
int
stride_sample_y
,
const
int
stride_sample_dst
)
{
...
@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
...
@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
constexpr
int
qk
=
ggml_cuda_type_traits
<
type
>::
qk
;
constexpr
int
qk
=
ggml_cuda_type_traits
<
type
>::
qk
;
constexpr
int
mmq_y
=
get_mmq_y_device
();
constexpr
int
mmq_y
=
get_mmq_y_device
();
const
int
ntx
=
(
ncols_
y
+
mmq_x
-
1
)
/
mmq_x
;
// Number of tiles x
const
int
ntx
=
(
ncols_
dst
+
mmq_x
-
1
)
/
mmq_x
;
// Number of tiles x
const
int
nty
=
(
nrows_x
+
mmq_y
-
1
)
/
mmq_y
;
// Number of tiles y
const
int
nty
=
(
nrows_x
+
mmq_y
-
1
)
/
mmq_y
;
// Number of tiles y
// Initialize the ids for writing back data with just the index.
// Initialize the ids for writing back data with just the index.
// For regular matrix multiplications this is never changed.
// For regular matrix multiplications this is never changed.
...
@@ -2636,6 +2636,7 @@ static __global__ void mul_mat_q(
...
@@ -2636,6 +2636,7 @@ static __global__ void mul_mat_q(
ids_dst_shared
[
j
]
=
j
;
ids_dst_shared
[
j
]
=
j
;
}
}
__syncthreads
();
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
...
@@ -2647,8 +2648,8 @@ static __global__ void mul_mat_q(
...
@@ -2647,8 +2648,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
// Defaults for regular matrix multiplication:
int
col_low
=
0
;
int
col_low
=
0
;
int
col_high
=
ncols_
y
;
int
col_high
=
ncols_
dst
;
int
col_diff
=
ncols_
y
;
int
col_diff
=
ncols_
dst
;
int
offset_y
=
wt
*
stride_sample_y
+
zt
*
stride_channel_y
;
int
offset_y
=
wt
*
stride_sample_y
+
zt
*
stride_channel_y
;
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
;
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
;
...
@@ -2664,6 +2665,7 @@ static __global__ void mul_mat_q(
...
@@ -2664,6 +2665,7 @@ static __global__ void mul_mat_q(
return
;
return
;
}
}
// __syncthreads(); // There is no previous tile that could cause a race condition.
#pragma unroll
#pragma unroll
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
*
WARP_SIZE
)
{
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
*
WARP_SIZE
)
{
const
int
j
=
j0
+
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
const
int
j
=
j0
+
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
...
@@ -2674,6 +2676,7 @@ static __global__ void mul_mat_q(
...
@@ -2674,6 +2676,7 @@ static __global__ void mul_mat_q(
ids_dst_shared
[
j
]
=
ids_dst
[
col_low
+
jt
*
mmq_x
+
j
];
ids_dst_shared
[
j
]
=
ids_dst
[
col_low
+
jt
*
mmq_x
+
j
];
}
}
__syncthreads
();
}
}
offset_y
+=
(
col_low
+
jt
*
mmq_x
)
*
(
sizeof
(
block_q8_1_mmq
)
/
sizeof
(
int
));
offset_y
+=
(
col_low
+
jt
*
mmq_x
)
*
(
sizeof
(
block_q8_1_mmq
)
/
sizeof
(
int
));
...
@@ -2686,7 +2689,7 @@ static __global__ void mul_mat_q(
...
@@ -2686,7 +2689,7 @@ static __global__ void mul_mat_q(
constexpr
bool
fixup
=
false
;
constexpr
bool
fixup
=
false
;
mul_mat_q_process_tile
<
type
,
mmq_x
,
nwarps
,
need_check
,
fixup
>
mul_mat_q_process_tile
<
type
,
mmq_x
,
nwarps
,
need_check
,
fixup
>
(
x
,
offset_x
,
y
+
offset_y
,
ids_dst_shared
,
dst
+
offset_dst
,
tmp_fixup
,
n
row
s
_x
,
ncols_y
,
stride_row_x
,
stride_col_dst
,
(
x
,
offset_x
,
y
+
offset_y
,
ids_dst_shared
,
dst
+
offset_dst
,
tmp_fixup
,
stride_
row_x
,
ncols_y
,
stride_col_dst
,
tile_x_max_i
,
tile_y_max_j
,
0
,
ncols_x
/
qk
);
tile_x_max_i
,
tile_y_max_j
,
0
,
ncols_x
/
qk
);
return
;
return
;
}
}
...
@@ -2717,8 +2720,8 @@ static __global__ void mul_mat_q(
...
@@ -2717,8 +2720,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
// Defaults for regular matrix multiplication:
int
col_low
=
0
;
int
col_low
=
0
;
int
col_high
=
ncols_
y
;
int
col_high
=
ncols_
dst
;
int
col_diff
=
ncols_
y
;
int
col_diff
=
ncols_
dst
;
int
offset_y
=
wt
*
stride_sample_y
+
zt
*
stride_channel_y
;
int
offset_y
=
wt
*
stride_sample_y
+
zt
*
stride_channel_y
;
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
;
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
;
...
@@ -2740,6 +2743,7 @@ static __global__ void mul_mat_q(
...
@@ -2740,6 +2743,7 @@ static __global__ void mul_mat_q(
continue
;
continue
;
}
}
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
*
WARP_SIZE
)
{
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
*
WARP_SIZE
)
{
const
int
j
=
j0
+
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
const
int
j
=
j0
+
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
...
@@ -2750,6 +2754,7 @@ static __global__ void mul_mat_q(
...
@@ -2750,6 +2754,7 @@ static __global__ void mul_mat_q(
ids_dst_shared
[
j
]
=
ids_dst
[
col_low
+
jt
*
mmq_x
+
j
];
ids_dst_shared
[
j
]
=
ids_dst
[
col_low
+
jt
*
mmq_x
+
j
];
}
}
__syncthreads
();
}
}
offset_y
+=
(
col_low
+
jt
*
mmq_x
)
*
(
sizeof
(
block_q8_1_mmq
)
/
sizeof
(
int
));
offset_y
+=
(
col_low
+
jt
*
mmq_x
)
*
(
sizeof
(
block_q8_1_mmq
)
/
sizeof
(
int
));
...
@@ -2762,7 +2767,7 @@ static __global__ void mul_mat_q(
...
@@ -2762,7 +2767,7 @@ static __global__ void mul_mat_q(
constexpr
bool
fixup
=
false
;
// All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
constexpr
bool
fixup
=
false
;
// All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
mul_mat_q_process_tile
<
type
,
mmq_x
,
nwarps
,
need_check
,
fixup
>
mul_mat_q_process_tile
<
type
,
mmq_x
,
nwarps
,
need_check
,
fixup
>
(
x
,
offset_x
,
y
+
offset_y
,
ids_dst_shared
,
dst
+
offset_dst
,
tmp_fixup
,
n
row
s
_x
,
ncols_y
,
stride_row_x
,
stride_col_dst
,
(
x
,
offset_x
,
y
+
offset_y
,
ids_dst_shared
,
dst
+
offset_dst
,
tmp_fixup
,
stride_
row_x
,
ncols_y
,
stride_col_dst
,
tile_x_max_i
,
tile_y_max_j
,
kb0_start
,
kb0_stop
);
tile_x_max_i
,
tile_y_max_j
,
kb0_start
,
kb0_stop
);
kbc
+=
blocks_per_ne00
;
kbc
+=
blocks_per_ne00
;
...
@@ -2787,8 +2792,8 @@ static __global__ void mul_mat_q(
...
@@ -2787,8 +2792,8 @@ static __global__ void mul_mat_q(
// Defaults for regular matrix multiplication:
// Defaults for regular matrix multiplication:
int
col_low
=
0
;
int
col_low
=
0
;
int
col_high
=
ncols_
y
;
int
col_high
=
ncols_
dst
;
int
col_diff
=
ncols_
y
;
int
col_diff
=
ncols_
dst
;
int
offset_y
=
wt
*
stride_sample_y
+
zt
*
stride_channel_y
;
int
offset_y
=
wt
*
stride_sample_y
+
zt
*
stride_channel_y
;
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
;
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
;
...
@@ -2805,6 +2810,7 @@ static __global__ void mul_mat_q(
...
@@ -2805,6 +2810,7 @@ static __global__ void mul_mat_q(
}
}
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
__syncthreads
();
#pragma unroll
#pragma unroll
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
*
WARP_SIZE
)
{
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
*
WARP_SIZE
)
{
const
int
j
=
j0
+
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
const
int
j
=
j0
+
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
...
@@ -2815,6 +2821,7 @@ static __global__ void mul_mat_q(
...
@@ -2815,6 +2821,7 @@ static __global__ void mul_mat_q(
ids_dst_shared
[
j
]
=
j
;
ids_dst_shared
[
j
]
=
j
;
}
}
__syncthreads
();
}
}
offset_y
+=
(
col_low
+
jt
*
mmq_x
)
*
(
sizeof
(
block_q8_1_mmq
)
/
sizeof
(
int
));
offset_y
+=
(
col_low
+
jt
*
mmq_x
)
*
(
sizeof
(
block_q8_1_mmq
)
/
sizeof
(
int
));
...
@@ -2827,7 +2834,7 @@ static __global__ void mul_mat_q(
...
@@ -2827,7 +2834,7 @@ static __global__ void mul_mat_q(
constexpr
bool
fixup
=
true
;
// Last index writes its data to fixup buffer to avoid data races with other blocks.
constexpr
bool
fixup
=
true
;
// Last index writes its data to fixup buffer to avoid data races with other blocks.
mul_mat_q_process_tile
<
type
,
mmq_x
,
nwarps
,
need_check
,
fixup
>
mul_mat_q_process_tile
<
type
,
mmq_x
,
nwarps
,
need_check
,
fixup
>
(
x
,
offset_x
,
y
+
offset_y
,
ids_dst_shared
,
dst
+
offset_dst
,
tmp_fixup
,
n
row
s
_x
,
ncols_y
,
stride_row_x
,
stride_col_dst
,
(
x
,
offset_x
,
y
+
offset_y
,
ids_dst_shared
,
dst
+
offset_dst
,
tmp_fixup
,
stride_
row_x
,
ncols_y
,
stride_col_dst
,
tile_x_max_i
,
tile_y_max_j
,
kb0_start
,
kb0_stop
);
tile_x_max_i
,
tile_y_max_j
,
kb0_start
,
kb0_stop
);
}
}
...
@@ -2835,7 +2842,7 @@ static __global__ void mul_mat_q(
...
@@ -2835,7 +2842,7 @@ static __global__ void mul_mat_q(
template
<
ggml_type
type
,
int
mmq_x
,
int
nwarps
,
bool
need_check
>
template
<
ggml_type
type
,
int
mmq_x
,
int
nwarps
,
bool
need_check
>
static
__global__
void
mul_mat_q_stream_k_fixup
(
static
__global__
void
mul_mat_q_stream_k_fixup
(
const
int32_t
*
ids_dst
,
const
int32_t
*
expert_bounds
,
float
*
__restrict__
dst
,
const
float
*
__restrict__
tmp_last_tile
,
const
int32_t
*
ids_dst
,
const
int32_t
*
expert_bounds
,
float
*
__restrict__
dst
,
const
float
*
__restrict__
tmp_last_tile
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_
y
,
const
int
stride_col_dst
,
const
int
ncols_x
,
const
int
nrows_x
,
const
int
ncols_
dst
,
const
int
stride_col_dst
,
const
int
nchannels_y
,
const
int
stride_channel_dst
,
const
int
nsamples_y
,
const
int
stride_sample_dst
)
{
const
int
nchannels_y
,
const
int
stride_channel_dst
,
const
int
nsamples_y
,
const
int
stride_sample_dst
)
{
constexpr
int
mmq_y
=
get_mmq_y_device
();
constexpr
int
mmq_y
=
get_mmq_y_device
();
constexpr
int
qk
=
ggml_cuda_type_traits
<
type
>::
qk
;
constexpr
int
qk
=
ggml_cuda_type_traits
<
type
>::
qk
;
...
@@ -2844,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
...
@@ -2844,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
float
sum
[
mmq_x
*
mmq_y
/
(
nwarps
*
WARP_SIZE
)]
=
{
0.0
f
};
float
sum
[
mmq_x
*
mmq_y
/
(
nwarps
*
WARP_SIZE
)]
=
{
0.0
f
};
const
int
ntx
=
(
ncols_
y
+
mmq_x
-
1
)
/
mmq_x
;
const
int
ntx
=
(
ncols_
dst
+
mmq_x
-
1
)
/
mmq_x
;
const
int
nty
=
(
nrows_x
+
mmq_y
-
1
)
/
mmq_y
;
const
int
nty
=
(
nrows_x
+
mmq_y
-
1
)
/
mmq_y
;
const
int
bidx0
=
blockIdx
.
x
;
const
int
bidx0
=
blockIdx
.
x
;
...
@@ -2918,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
...
@@ -2918,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
const
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
+
it
*
mmq_y
;
const
int
offset_dst
=
wt
*
stride_sample_dst
+
zt
*
stride_channel_dst
+
jt
*
mmq_x
*
stride_col_dst
+
it
*
mmq_y
;
dst
+=
offset_dst
;
dst
+=
offset_dst
;
const
int
i_max
=
nrows_x
-
it
*
mmq_y
-
1
;
const
int
i_max
=
nrows_x
-
it
*
mmq_y
-
1
;
const
int
j_max
=
ncols_
y
-
jt
*
mmq_x
-
1
;
const
int
j_max
=
ncols_
dst
-
jt
*
mmq_x
-
1
;
#pragma unroll
#pragma unroll
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
)
{
for
(
int
j0
=
0
;
j0
<
mmq_x
;
j0
+=
nwarps
)
{
...
@@ -2951,6 +2958,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
...
@@ -2951,6 +2958,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
for
(
int
j
=
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
j
<
mmq_x
;
j
+=
nwarps
*
WARP_SIZE
)
{
for
(
int
j
=
threadIdx
.
y
*
WARP_SIZE
+
threadIdx
.
x
;
j
<
mmq_x
;
j
+=
nwarps
*
WARP_SIZE
)
{
ids_dst_shared
[
j
]
=
ids_dst
[
col_low
+
j
];
ids_dst_shared
[
j
]
=
ids_dst
[
col_low
+
j
];
}
}
__syncthreads
();
const
int
offset_dst
=
it
*
mmq_y
;
const
int
offset_dst
=
it
*
mmq_y
;
dst
+=
offset_dst
;
dst
+=
offset_dst
;
...
@@ -2981,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
...
@@ -2981,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
struct
mmq_args
{
struct
mmq_args
{
const
char
*
x
;
ggml_type
type_x
;
const
int
*
y
;
const
int32_t
*
ids_dst
;
const
int32_t
*
expert_bounds
;
float
*
dst
;
const
char
*
x
;
ggml_type
type_x
;
const
int
*
y
;
const
int32_t
*
ids_dst
;
const
int32_t
*
expert_bounds
;
float
*
dst
;
int64_t
ncols_x
;
int64_t
nrows_x
;
int64_t
ncols_
y
;
int64_t
stride_row_x
;
int64_t
nrows_dst
;
int64_t
ncols_x
;
int64_t
nrows_x
;
int64_t
ncols_
dst
;
int64_t
stride_row_x
;
int64_t
ncols_y
;
int64_t
nrows_dst
;
int64_t
nchannels_x
;
int64_t
nchannels_y
;
int64_t
stride_channel_x
;
int64_t
stride_channel_y
;
int64_t
stride_channel_dst
;
int64_t
nchannels_x
;
int64_t
nchannels_y
;
int64_t
stride_channel_x
;
int64_t
stride_channel_y
;
int64_t
stride_channel_dst
;
int64_t
nsamples_x
;
int64_t
nsamples_y
;
int64_t
stride_sample_x
;
int64_t
stride_sample_y
;
int64_t
stride_sample_dst
;
int64_t
nsamples_x
;
int64_t
nsamples_y
;
int64_t
stride_sample_x
;
int64_t
stride_sample_y
;
int64_t
stride_sample_dst
;
bool
use_stream_k
;
bool
use_stream_k
;
...
@@ -3017,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
...
@@ -3017,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
const
int
nty
=
(
args
.
nrows_x
+
mmq_y
-
1
)
/
mmq_y
;
const
int
nty
=
(
args
.
nrows_x
+
mmq_y
-
1
)
/
mmq_y
;
const
int
ntx
=
(
args
.
ncols_
y
+
mmq_x
-
1
)
/
mmq_x
;
const
int
ntx
=
(
args
.
ncols_
dst
+
mmq_x
-
1
)
/
mmq_x
;
const
int
ntzw
=
args
.
nchannels_y
*
args
.
nsamples_y
;
const
int
ntzw
=
args
.
nchannels_y
*
args
.
nsamples_y
;
const
dim3
block_nums_xy_tiling
(
nty
,
ntx
,
ntzw
);
const
dim3
block_nums_xy_tiling
(
nty
,
ntx
,
ntzw
);
...
@@ -3032,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
...
@@ -3032,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
constexpr
bool
need_check
=
false
;
constexpr
bool
need_check
=
false
;
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_xy_tiling
,
block_dims
,
nbytes_shared
,
stream
>>>
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_xy_tiling
,
block_dims
,
nbytes_shared
,
stream
>>>
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
nullptr
,
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
nullptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
y
,
args
.
stride_row_x
,
args
.
nrows_dst
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
dst
,
args
.
stride_row_x
,
args
.
ncols_y
,
args
.
nrows_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
}
else
{
}
else
{
constexpr
bool
need_check
=
true
;
constexpr
bool
need_check
=
true
;
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_xy_tiling
,
block_dims
,
nbytes_shared
,
stream
>>>
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_xy_tiling
,
block_dims
,
nbytes_shared
,
stream
>>>
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
nullptr
,
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
nullptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
y
,
args
.
stride_row_x
,
args
.
nrows_dst
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
dst
,
args
.
stride_row_x
,
args
.
ncols_y
,
args
.
nrows_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
}
}
...
@@ -3060,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
...
@@ -3060,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
nbytes_shared
,
stream
>>>
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
nbytes_shared
,
stream
>>>
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
y
,
args
.
stride_row_x
,
args
.
nrows_dst
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
dst
,
args
.
stride_row_x
,
args
.
ncols_y
,
args
.
nrows_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
...
@@ -3069,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
...
@@ -3069,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
}
mul_mat_q_stream_k_fixup
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
0
,
stream
>>>
mul_mat_q_stream_k_fixup
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
0
,
stream
>>>
(
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
y
,
(
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
dst
,
args
.
nrows_dst
,
args
.
nchannels_y
,
args
.
stride_channel_dst
,
args
.
nsamples_y
,
args
.
stride_sample_dst
);
args
.
nrows_dst
,
args
.
nchannels_y
,
args
.
stride_channel_dst
,
args
.
nsamples_y
,
args
.
stride_sample_dst
);
}
else
{
}
else
{
constexpr
bool
need_check
=
true
;
constexpr
bool
need_check
=
true
;
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
nbytes_shared
,
stream
>>>
mul_mat_q
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
nbytes_shared
,
stream
>>>
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
(
args
.
x
,
args
.
y
,
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
y
,
args
.
stride_row_x
,
args
.
nrows_dst
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
dst
,
args
.
stride_row_x
,
args
.
ncols_y
,
args
.
nrows_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
channel_ratio
,
args
.
nchannels_y
,
args
.
stride_channel_x
,
args
.
stride_channel_y
,
args
.
stride_channel_dst
,
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
sample_ratio
,
args
.
nsamples_y
,
args
.
stride_sample_x
,
args
.
stride_sample_y
,
args
.
stride_sample_dst
);
...
@@ -3085,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
...
@@ -3085,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
}
}
mul_mat_q_stream_k_fixup
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
0
,
stream
>>>
mul_mat_q_stream_k_fixup
<
type
,
mmq_x
,
MMQ_NWARPS
,
need_check
><<<
block_nums_stream_k
,
block_dims
,
0
,
stream
>>>
(
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
y
,
(
args
.
ids_dst
,
args
.
expert_bounds
,
args
.
dst
,
tmp_fixup
.
ptr
,
args
.
ncols_x
,
args
.
nrows_x
,
args
.
ncols_
dst
,
args
.
nrows_dst
,
args
.
nchannels_y
,
args
.
stride_channel_dst
,
args
.
nsamples_y
,
args
.
stride_sample_dst
);
args
.
nrows_dst
,
args
.
nchannels_y
,
args
.
stride_channel_dst
,
args
.
nsamples_y
,
args
.
stride_sample_dst
);
}
}
}
}
...
...
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu
View file @
0cefd46f
...
@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
...
@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
const
int32_t
*
ids_d
=
ids
?
(
const
int32_t
*
)
ids
->
data
:
nullptr
;
const
int32_t
*
ids_d
=
ids
?
(
const
int32_t
*
)
ids
->
data
:
nullptr
;
float
*
dst_d
=
(
float
*
)
dst
->
data
;
float
*
dst_d
=
(
float
*
)
dst
->
data
;
// If src0 is a temporary compute buffer, clear any potential padding.
if
(
ggml_backend_buffer_get_usage
(
src0
->
buffer
)
==
GGML_BACKEND_BUFFER_USAGE_COMPUTE
)
{
const
size_t
size_data
=
ggml_nbytes
(
src0
);
const
size_t
size_alloc
=
ggml_backend_buffer_get_alloc_size
(
src0
->
buffer
,
src0
);
if
(
size_alloc
>
size_data
)
{
GGML_ASSERT
(
ggml_is_contiguously_allocated
(
src0
));
GGML_ASSERT
(
!
src0
->
view_src
);
CUDA_CHECK
(
cudaMemsetAsync
((
char
*
)
src0
->
data
+
size_data
,
0
,
size_alloc
-
size_data
,
stream
));
}
}
const
int64_t
ne10_padded
=
GGML_PAD
(
ne10
,
MATRIX_ROW_PADDING
);
const
int64_t
ne10_padded
=
GGML_PAD
(
ne10
,
MATRIX_ROW_PADDING
);
ggml_cuda_pool_alloc
<
char
>
src1_q8_1
(
ctx
.
pool
(),
ne13
*
ne12
*
ne11
*
ne10_padded
*
sizeof
(
block_q8_1
)
/
QK8_1
);
ggml_cuda_pool_alloc
<
char
>
src1_q8_1
(
ctx
.
pool
(),
ne13
*
ne12
*
ne11
*
ne10_padded
*
sizeof
(
block_q8_1
)
/
QK8_1
);
{
{
...
...
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu
View file @
0cefd46f
...
@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
...
@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
const
float
*
x
,
const
int32_t
*
ids
,
void
*
vy
,
const
ggml_type
type_src0
,
const
float
*
x
,
const
int32_t
*
ids
,
void
*
vy
,
const
ggml_type
type_src0
,
const
int64_t
ne00
,
const
int64_t
s01
,
const
int64_t
s02
,
const
int64_t
s03
,
const
int64_t
ne00
,
const
int64_t
s01
,
const
int64_t
s02
,
const
int64_t
s03
,
const
int64_t
ne0
,
const
int64_t
ne1
,
const
int64_t
ne2
,
const
int64_t
ne3
,
cudaStream_t
stream
)
{
const
int64_t
ne0
,
const
int64_t
ne1
,
const
int64_t
ne2
,
const
int64_t
ne3
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
ne00
%
4
==
0
);
GGML_ASSERT
(
ne0
%
(
4
*
QK8_1
)
==
0
);
GGML_ASSERT
(
ne0
%
(
4
*
QK8_1
)
==
0
);
const
int64_t
block_num_x
=
(
ne0
+
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
-
1
)
/
(
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
);
const
int64_t
block_num_x
=
(
ne0
+
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
-
1
)
/
(
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
);
...
...
ml/backend/ggml/ggml/src/ggml-cuda/sum.cu
View file @
0cefd46f
...
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
...
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT
(
src0
->
type
==
GGML_TYPE_F32
);
GGML_ASSERT
(
src0
->
type
==
GGML_TYPE_F32
);
GGML_ASSERT
(
dst
->
type
==
GGML_TYPE_F32
);
GGML_ASSERT
(
dst
->
type
==
GGML_TYPE_F32
);
GGML_ASSERT
(
ggml_is_contiguous
(
src0
));
GGML_ASSERT
(
ggml_is_contiguous
ly_allocated
(
src0
));
const
float
*
src0_d
=
(
const
float
*
)
src0
->
data
;
const
float
*
src0_d
=
(
const
float
*
)
src0
->
data
;
float
*
dst_d
=
(
float
*
)
dst
->
data
;
float
*
dst_d
=
(
float
*
)
dst
->
data
;
...
...
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
0 → 100644
View file @
0cefd46f
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
576
,
512
,
1
,
16
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
80
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
96
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
112
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
128
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
256
,
1
,
8
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
1
,
8
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
80
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
96
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
112
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
128
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
256
,
16
,
1
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
16
,
1
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
80
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
96
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
112
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
128
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
256
,
16
,
2
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
16
,
2
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
80
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
96
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
112
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
128
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
256
,
16
,
4
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
16
,
4
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
0 → 100644
View file @
0cefd46f
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
576
,
512
,
2
,
16
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
80
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
96
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
112
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
128
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
256
,
2
,
4
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
2
,
4
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
80
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
96
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
112
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
128
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
256
,
2
,
8
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
2
,
8
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
80
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
96
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
112
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
128
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
256
,
32
,
1
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
32
,
1
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
80
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
96
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
112
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
128
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
256
,
32
,
2
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
32
,
2
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
0 → 100644
View file @
0cefd46f
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
576
,
512
,
4
,
16
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
80
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
96
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
112
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
128
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
256
,
4
,
2
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
4
,
2
);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
View file @
0cefd46f
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
#include "../fattn-mma-f16.cuh"
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE
(
64
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
64
,
64
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
80
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
80
,
80
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
96
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
96
,
96
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
112
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
112
,
112
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
128
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
128
,
128
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
256
,
4
,
4
);
DECL_FATTN_MMA_F16_CASE
(
256
,
256
,
4
,
4
);
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment