OpenDAS / ollama, commit 0cefd46f

Unverified commit 0cefd46f, authored May 12, 2025 by Jeffrey Morgan, committed via GitHub on May 12, 2025.

llama: update to commit de4c07f93 (#10655)

parent: ad035ad5

Changes: 113 files in total. This page shows 20 changed files with 179 additions and 116 deletions (+179 -116).
Changed files on this page:

ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu  (+14 -12)
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu  (+28 -13)
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu  (+14 -3)
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh  (+35 -27)
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu  (+11 -0)
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu  (+1 -0)
ml/backend/ggml/ggml/src/ggml-cuda/sum.cu  (+1 -1)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu  (+5 -0, new file)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu  (+5 -0, new file)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu  (+5 -0, new file)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu  (+6 -6)
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu  (+14 -12)

@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
             /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
             const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = (blockIdx.y*blockDim.x + threadIdx.x)*2;
+    const int i10 = blockIdx.x;
+    const int i11 = blockIdx.z / ne12;
+    const int i12 = blockIdx.z % ne12;

     if (i00 >= ne00) {
         return;

@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
             /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
             const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = blockIdx.y*blockDim.x + threadIdx.x;
+    const int i10 = blockIdx.x;
+    const int i11 = blockIdx.z / ne12;
+    const int i12 = blockIdx.z % ne12;

     if (i00 >= ne00) {
         return;

@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
         const size_t nb1, const size_t nb2, const size_t nb3,
         cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);

     // strides in elements
     // const size_t s0 = nb0 / sizeof(dst_t);

@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
         const size_t nb1, const size_t nb2, const size_t nb3,
         cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);

     // strides in elements
     // const size_t s0 = nb0 / sizeof(dst_t);
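Background for the comment repeated in the hunks above ("the maximum allowed grid size for x is higher"): CUDA permits on the order of 2^31 - 1 blocks along gridDim.x but only 65535 along gridDim.y and gridDim.z, so the dimension that can grow very large (here the number of rows to gather, ne10) is moved to x. The following stand-alone sketch only illustrates that limit; the kernel, names, and sizes are invented for the example and are not part of this commit.

#include <cstdio>
#include <cuda_runtime.h>

// Toy gather kernel: one block per gathered row along x, thread blocks tiled over columns along y.
__global__ void toy_get_rows(const float * src, const int * rows, float * dst, int ncols) {
    const int row = blockIdx.x;                            // large dimension -> x (limit ~2^31 - 1 blocks)
    const int col = blockIdx.y*blockDim.x + threadIdx.x;   // smaller dimension -> y (limit 65535 blocks)
    if (col < ncols) {
        dst[(size_t) row*ncols + col] = src[(size_t) rows[row]*ncols + col];
    }
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // On current GPUs this typically prints: max grid x=2147483647 y=65535 z=65535.
    printf("max grid x=%d y=%d z=%d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);

    const int nrows_to_get = 1 << 20;  // more than 65535: fine for gridDim.x, too large for gridDim.y
    const int ncols        = 4096;
    const int block_size   = 256;
    const dim3 block_dims(block_size, 1, 1);
    const dim3 block_nums(nrows_to_get, (ncols + block_size - 1)/block_size, 1);
    // toy_get_rows<<<block_nums, block_dims>>>(src, rows, dst, ncols);  // launch omitted: no buffers allocated in this sketch
    (void) block_dims; (void) block_nums;
    return 0;
}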
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu  (+28 -13)

@@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
     if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+        const size_t original_size = ggml_nbytes(tensor);
+        const size_t padded_size   = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

         if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);

@@ -680,6 +680,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
+            GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
     }

@@ -802,6 +803,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
 static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *) buffer->context;
     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) buffer->buft->context;

@@ -853,6 +855,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) buffer->buft->context;

@@ -891,6 +894,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) buffer->buft->context;

@@ -972,6 +976,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
 static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *) buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");

     size_t total_size = 0;

@@ -1534,6 +1539,8 @@ static void ggml_cuda_op_mul_mat(
         // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
             const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));

@@ -1905,13 +1912,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

+    // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+    // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
+    // Therefore, in such cases use cuBLAS.
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
     bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q     = ggml_is_quantized(src0->type)
+    bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

     bool any_gpus_with_slow_fp16 = false;

@@ -2065,9 +2078,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         }

         ggml_tensor src0_slice = *src0;
-        src0_slice.ne[2] = 1;
-        src0_slice.nb[3] = src0_slice.nb[2];
-        src0_slice.data  = (char *) src0->data + i02*nb02;
+        src0_slice.ne[2]    = 1;
+        src0_slice.nb[3]    = src0_slice.nb[2];
+        src0_slice.op       = GGML_OP_VIEW;
+        src0_slice.view_src = dst->src[0]; // non-const pointer to src0
+        src0_slice.data     = (char *) src0->data + i02*nb02;

         ggml_tensor src1_slice;
         memset(&src1_slice, 0, sizeof(src1_slice));

@@ -3213,16 +3228,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return false;
 #endif // FLASH_ATTN_AVAILABLE
             if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
-                // different head sizes of K and V are not supported yet
-                return false;
+                const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+                if (!new_mma_available(cc) || cc < GGML_CUDA_CC_AMPERE) {
+                    return false;
+                }
+                const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
+                return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
             }
             if (op->src[0]->ne[0] == 192) {
                 return false;
             }
-            if (op->src[0]->ne[0] == 576) {
-                // DeepSeek MLA
-                return false;
-            }
             if (op->src[0]->ne[3] != 1) {
                 return false;
             }
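The ggml-cuda.cu hunks above all revolve around padding on quantized tensors: allocation sizes are rounded up to a multiple of MATRIX_ROW_PADDING, the padded tail has to be zeroed before mul_mat_q or mul_mat_vec_q reads it (otherwise the kernels may consume uninitialized quant blocks and produce NaNs), and when src0 is a view the clear is unsafe, so bad_padding_clear forces the cuBLAS path instead. The snippet below is a rough, self-contained sketch of the padding arithmetic only; the constants (a padding granularity of 512 elements, a 32-element quant block of 18 bytes) and the row_size helper are assumptions made for the example, not values read from this repository.

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins for ggml's helpers, with assumed values.
constexpr int64_t MATRIX_ROW_PADDING = 512;  // assumed padding granularity in elements
constexpr int64_t QK                 = 32;   // assumed elements per quant block
constexpr size_t  BLOCK_BYTES        = 18;   // assumed bytes per quant block

// Rough analogue of ggml_row_size(type, n): bytes needed for n elements of a block-quantized type.
static size_t row_size(int64_t n) {
    return (size_t)(n/QK)*BLOCK_BYTES;
}

int main() {
    const int64_t ne00 = 11008;                     // hypothetical row length in elements
    const int64_t rem  = ne00 % MATRIX_ROW_PADDING; // 11008 % 512 = 256, so the row is not aligned

    size_t padding_bytes = 0;
    if (rem != 0) {
        // Same shape as the diff: pad each row out to the next multiple of MATRIX_ROW_PADDING.
        padding_bytes = row_size(MATRIX_ROW_PADDING - rem);
    }
    // In the real code this tail is zeroed with cudaMemsetAsync so the MMQ/MMVQ kernels
    // never read uninitialized quant blocks.
    printf("row bytes: %zu, padding bytes per row: %zu\n", row_size(ne00), padding_bytes);
    return 0;
}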
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu  (+14 -3)

@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
     const float * src1_d = (const float *) src1->data;
     float * dst_d = (float *) dst->data;

+    // If src0 is a temporary compute buffer, clear any potential padding.
+    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+        const size_t size_data  = ggml_nbytes(src0);
+        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+        if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
+            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+        }
+    }
+
     const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

     const int64_t s01 = src0->nb[1] / ts_src0;

@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
     const mmq_args args = {
-        src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, ne00, ne01, ne1, s01, s1,
+        src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, ne00, ne01, ne1, s01, ne11, s1,
         ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, use_stream_k};

@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
     // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
     const mmq_args args = {
-        src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, ne00, ne01, ne_get_rows, s01, s1,
+        src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
         ne02, ne02, s02, s12, s2, ne03, ne13, s03, s13, s3, use_stream_k};

@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
         ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
     const mmq_args args = {
-        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride01, nrows_dst,
+        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
         1, 1, 0, 0, 0, 1, 1, 0, 0, 0, use_stream_k};
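mmq.cu above (and mmvq.cu below) now clear any allocator slack behind src0 whenever it sits in a temporary compute buffer. Stripped of the ggml-specific accessors, the pattern reduces to the sketch below; the function and parameter names are invented for illustration, and only the plain CUDA runtime call is assumed to exist.

#include <cassert>
#include <cuda_runtime.h>

// Generic form of the guard added above (not the ggml API).
//   data       - device pointer to the tensor's storage
//   size_data  - bytes the producer actually wrote (ggml_nbytes in the diff)
//   size_alloc - bytes reserved by the allocator, possibly padded (get_alloc_size in the diff)
static void clear_alloc_padding(void * data, size_t size_data, size_t size_alloc, cudaStream_t stream) {
    if (size_alloc > size_data) {
        // Only safe when the tensor owns a contiguous allocation and is not a view of another tensor;
        // otherwise the memset could overwrite live data. That is exactly what the GGML_ASSERTs and the
        // bad_padding_clear fallback to cuBLAS protect against.
        const cudaError_t err = cudaMemsetAsync((char *) data + size_data, 0, size_alloc - size_data, stream);
        assert(err == cudaSuccess);
        (void) err;
    }
}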
ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh  (+35 -27)

@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
 static __device__ __forceinline__ void mul_mat_q_process_tile(
     const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
     const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
-    const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
+    const int stride_row_x, const int ncols_y, const int stride_col_dst,
     const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {

     constexpr int qk = ggml_cuda_type_traits<type>::qk;

@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 static __global__ void mul_mat_q(
     const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
     const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
+    const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
     const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
     const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {

@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
     constexpr int qk    = ggml_cuda_type_traits<type>::qk;
     constexpr int mmq_y = get_mmq_y_device();

-    const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
-    const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
+    const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
+    const int nty = (nrows_x   + mmq_y - 1) / mmq_y; // Number of tiles y

     // Initialize the ids for writing back data with just the index.
     // For regular matrix multiplications this is never changed.

@@ -2636,6 +2636,7 @@ static __global__ void mul_mat_q(
         ids_dst_shared[j] = j;
     }
+    __syncthreads();

     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA

@@ -2647,8 +2648,8 @@ static __global__ void mul_mat_q(
         // Defaults for regular matrix multiplication:
         int col_low  = 0;
-        int col_high = ncols_y;
-        int col_diff = ncols_y;
+        int col_high = ncols_dst;
+        int col_diff = ncols_dst;

         int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
         int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

@@ -2664,6 +2665,7 @@ static __global__ void mul_mat_q(
             return;
         }

+        // __syncthreads(); // There is no previous tile that could cause a race condition.
 #pragma unroll
         for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
             const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;

@@ -2674,6 +2676,7 @@ static __global__ void mul_mat_q(
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
+            __syncthreads();
         }

         offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));

@@ -2686,7 +2689,7 @@ static __global__ void mul_mat_q(
         constexpr bool fixup = false;
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
-            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
+            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
              tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
         return;
     }

@@ -2717,8 +2720,8 @@ static __global__ void mul_mat_q(
         // Defaults for regular matrix multiplication:
         int col_low  = 0;
-        int col_high = ncols_y;
-        int col_diff = ncols_y;
+        int col_high = ncols_dst;
+        int col_diff = ncols_dst;

         int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
         int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

@@ -2740,6 +2743,7 @@ static __global__ void mul_mat_q(
             continue;
         }

+        __syncthreads();
 #pragma unroll
         for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
             const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;

@@ -2750,6 +2754,7 @@ static __global__ void mul_mat_q(
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
+            __syncthreads();
         }

         offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));

@@ -2762,7 +2767,7 @@ static __global__ void mul_mat_q(
         constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
-            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
+            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
              tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);

         kbc += blocks_per_ne00;

@@ -2787,8 +2792,8 @@ static __global__ void mul_mat_q(
         // Defaults for regular matrix multiplication:
         int col_low  = 0;
-        int col_high = ncols_y;
-        int col_diff = ncols_y;
+        int col_high = ncols_dst;
+        int col_diff = ncols_dst;

         int offset_y   = wt*stride_sample_y   + zt*stride_channel_y;
         int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;

@@ -2805,6 +2810,7 @@ static __global__ void mul_mat_q(
         }

         // The memory layout for the fixup buffer is always contiguous, therefore reset ids:
+        __syncthreads();
 #pragma unroll
         for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
             const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;

@@ -2815,6 +2821,7 @@ static __global__ void mul_mat_q(
                 ids_dst_shared[j] = j;
             }
+            __syncthreads();
         }

         offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));

@@ -2827,7 +2834,7 @@ static __global__ void mul_mat_q(
         constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
         mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
-            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
+            (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
              tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
     }

@@ -2835,7 +2842,7 @@ static __global__ void mul_mat_q(
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 static __global__ void mul_mat_q_stream_k_fixup(
     const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
+    const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
     const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
     constexpr int mmq_y = get_mmq_y_device();
     constexpr int qk    = ggml_cuda_type_traits<type>::qk;

@@ -2844,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
     float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};

-    const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
-    const int nty = (nrows_x + mmq_y - 1) / mmq_y;
+    const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
+    const int nty = (nrows_x   + mmq_y - 1) / mmq_y;

     const int bidx0 = blockIdx.x;

@@ -2918,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
     const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
     dst += offset_dst;

-    const int i_max = nrows_x - it*mmq_y - 1;
-    const int j_max = ncols_y - jt*mmq_x - 1;
+    const int i_max = nrows_x   - it*mmq_y - 1;
+    const int j_max = ncols_dst - jt*mmq_x - 1;

 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {

@@ -2951,6 +2958,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
     for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) {
         ids_dst_shared[j] = ids_dst[col_low + j];
     }
+    __syncthreads();

     const int offset_dst = it*mmq_y;
     dst += offset_dst;

@@ -2981,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
 struct mmq_args {
     const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
-    int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
+    int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst;
     int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
     int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
     bool use_stream_k;

@@ -3017,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     }
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)

-    const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
-    const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
+    const int nty = (args.nrows_x   + mmq_y - 1) / mmq_y;
+    const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
     const int ntzw = args.nchannels_y * args.nsamples_y;
     const dim3 block_nums_xy_tiling(nty, ntx, ntzw);

@@ -3032,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         constexpr bool need_check = false;
         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
              channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
              sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
     } else {
         constexpr bool need_check = true;
         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
              channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
              sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
     }

@@ -3060,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
              channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
              sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);

@@ -3069,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         }

         mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
-            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
+            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
              args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
     } else {
         constexpr bool need_check = true;

         mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
-             args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
+             args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
              channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
              sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);

@@ -3085,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         }

         mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
-            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
+            (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
              args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
     }
 }
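Most of the mmq.cuh changes are a rename of ncols_y to ncols_dst plus threading an extra ncols_y argument through, but the added __syncthreads() calls are a correctness fix: ids_dst_shared is a shared-memory table that gets rewritten once per tile, so all threads must finish reading the previous contents before any thread overwrites them, and must see the fresh contents before using them. The kernel below is a generic, stand-alone sketch of that hazard with invented names and a fixed table size; it is not code from this patch.

// Assumes n <= 256 so the per-tile table fits in the shared array.
__global__ void shared_table_reuse(const int * __restrict__ ids, int * __restrict__ out, int ntiles, int n) {
    __shared__ int table[256];

    for (int tile = 0; tile < ntiles; ++tile) {
        // Writers refresh the table for the current tile.
        for (int j = threadIdx.x; j < n; j += blockDim.x) {
            table[j] = ids[tile*n + j];
        }
        __syncthreads();  // make the new contents visible to every reader before use

        // Readers consume the table.
        for (int j = threadIdx.x; j < n; j += blockDim.x) {
            out[tile*n + j] = table[j];
        }
        __syncthreads();  // ensure all reads finish before the next iteration overwrites the table
    }
}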
ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu  (+11 -0)

@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
     const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
     float * dst_d = (float *) dst->data;

+    // If src0 is a temporary compute buffer, clear any potential padding.
+    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
+        const size_t size_data  = ggml_nbytes(src0);
+        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
+        if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
+            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
+        }
+    }
+
     const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

     ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12*ne11*ne10_padded*sizeof(block_q8_1)/QK8_1);
     {
ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu  (+1 -0)

@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
     const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
     const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
     const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
     GGML_ASSERT(ne00 % 4 == 0);
+    GGML_ASSERT(ne0 % (4*QK8_1) == 0);

     const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
ml/backend/ggml/ggml/src/ggml-cuda/sum.cu  (+1 -1)

@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguously_allocated(src0));

     const float * src0_d = (const float *) src0->data;
     float * dst_d = (float *) dst->data;
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu  (new file, +5 -0)

// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-mma-f16.cuh"

DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 1, 8);
-DECL_FATTN_MMA_F16_CASE(80, 1, 8);
-DECL_FATTN_MMA_F16_CASE(96, 1, 8);
-DECL_FATTN_MMA_F16_CASE(112, 1, 8);
-DECL_FATTN_MMA_F16_CASE(128, 1, 8);
-DECL_FATTN_MMA_F16_CASE(256, 1, 8);
+DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8);
+DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
+DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
+DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
+DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
+DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 16, 1);
-DECL_FATTN_MMA_F16_CASE(80, 16, 1);
-DECL_FATTN_MMA_F16_CASE(96, 16, 1);
-DECL_FATTN_MMA_F16_CASE(112, 16, 1);
-DECL_FATTN_MMA_F16_CASE(128, 16, 1);
-DECL_FATTN_MMA_F16_CASE(256, 16, 1);
+DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1);
+DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1);
+DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1);
+DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1);
+DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1);
+DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 16, 2);
-DECL_FATTN_MMA_F16_CASE(80, 16, 2);
-DECL_FATTN_MMA_F16_CASE(96, 16, 2);
-DECL_FATTN_MMA_F16_CASE(112, 16, 2);
-DECL_FATTN_MMA_F16_CASE(128, 16, 2);
-DECL_FATTN_MMA_F16_CASE(256, 16, 2);
+DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2);
+DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2);
+DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
+DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
+DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
+DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 16, 4);
-DECL_FATTN_MMA_F16_CASE(80, 16, 4);
-DECL_FATTN_MMA_F16_CASE(96, 16, 4);
-DECL_FATTN_MMA_F16_CASE(112, 16, 4);
-DECL_FATTN_MMA_F16_CASE(128, 16, 4);
-DECL_FATTN_MMA_F16_CASE(256, 16, 4);
+DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4);
+DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4);
+DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
+DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
+DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
+DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu  (new file, +5 -0)

// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-mma-f16.cuh"

DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 2, 4);
-DECL_FATTN_MMA_F16_CASE(80, 2, 4);
-DECL_FATTN_MMA_F16_CASE(96, 2, 4);
-DECL_FATTN_MMA_F16_CASE(112, 2, 4);
-DECL_FATTN_MMA_F16_CASE(128, 2, 4);
-DECL_FATTN_MMA_F16_CASE(256, 2, 4);
+DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4);
+DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4);
+DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
+DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
+DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
+DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 2, 8);
-DECL_FATTN_MMA_F16_CASE(80, 2, 8);
-DECL_FATTN_MMA_F16_CASE(96, 2, 8);
-DECL_FATTN_MMA_F16_CASE(112, 2, 8);
-DECL_FATTN_MMA_F16_CASE(128, 2, 8);
-DECL_FATTN_MMA_F16_CASE(256, 2, 8);
+DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8);
+DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
+DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
+DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
+DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
+DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 32, 1);
-DECL_FATTN_MMA_F16_CASE(80, 32, 1);
-DECL_FATTN_MMA_F16_CASE(96, 32, 1);
-DECL_FATTN_MMA_F16_CASE(112, 32, 1);
-DECL_FATTN_MMA_F16_CASE(128, 32, 1);
-DECL_FATTN_MMA_F16_CASE(256, 32, 1);
+DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1);
+DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1);
+DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1);
+DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1);
+DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1);
+DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 32, 2);
-DECL_FATTN_MMA_F16_CASE(80, 32, 2);
-DECL_FATTN_MMA_F16_CASE(96, 32, 2);
-DECL_FATTN_MMA_F16_CASE(112, 32, 2);
-DECL_FATTN_MMA_F16_CASE(128, 32, 2);
-DECL_FATTN_MMA_F16_CASE(256, 32, 2);
+DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2);
+DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2);
+DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
+DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
+DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
+DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu  (new file, +5 -0)

// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-mma-f16.cuh"

DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 4, 2);
-DECL_FATTN_MMA_F16_CASE(80, 4, 2);
-DECL_FATTN_MMA_F16_CASE(96, 4, 2);
-DECL_FATTN_MMA_F16_CASE(112, 4, 2);
-DECL_FATTN_MMA_F16_CASE(128, 4, 2);
-DECL_FATTN_MMA_F16_CASE(256, 4, 2);
+DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2);
+DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2);
+DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
+DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
+DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
+DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu  (+6 -6)

@@ -2,9 +2,9 @@
 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 4, 4);
-DECL_FATTN_MMA_F16_CASE(80, 4, 4);
-DECL_FATTN_MMA_F16_CASE(96, 4, 4);
-DECL_FATTN_MMA_F16_CASE(112, 4, 4);
-DECL_FATTN_MMA_F16_CASE(128, 4, 4);
-DECL_FATTN_MMA_F16_CASE(256, 4, 4);
+DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4);
+DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4);
+DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
+DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
+DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
+DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);