Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
96a0d5f6
Commit
96a0d5f6
authored
Jan 16, 2025
by
illsilin
Browse files
merge from public
parents
bfdc2430
54de3e55
Changes
345
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1356 additions
and
263 deletions
+1356
-263
include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
...ps/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+24
-21
include/ck_tile/ops/softmax.hpp
include/ck_tile/ops/softmax.hpp
+1
-1
include/ck_tile/ops/topk.hpp
include/ck_tile/ops/topk.hpp
+1
-1
include/ck_tile/ops/topk_softmax.hpp
include/ck_tile/ops/topk_softmax.hpp
+1
-1
include/ck_tile/ref/naive_attention.hpp
include/ck_tile/ref/naive_attention.hpp
+291
-131
library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp
...reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp
+4
-34
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
...library/reference_tensor_operation/cpu/reference_gemm.hpp
+20
-9
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp
...erence_tensor_operation/cpu/reference_gemm_multiple_d.hpp
+3
-21
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
..._operation_instance/device_operation_instance_factory.hpp
+1
-0
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
...include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+36
-8
library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp
...ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp
+91
-0
library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc
...lude/ck/library/tensor_operation_instance/gpu/gemm_dl.inc
+0
-30
library/include/ck/library/tensor_operation_instance/gpu/gemm_dpp.inc
...ude/ck/library/tensor_operation_instance/gpu/gemm_dpp.inc
+65
-0
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
.../tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
+109
-2
library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
.../library/tensor_operation_instance/gpu/gemm_universal.hpp
+33
-0
library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
.../tensor_operation_instance/gpu/gemm_universal_streamk.hpp
+500
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
...device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+45
-1
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
...tion_instance/gpu/grouped_convolution_backward_weight.hpp
+17
-1
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
..._instance/gpu/grouped_convolution_backward_weight_xdl.inc
+97
-1
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
...or_operation_instance/gpu/grouped_convolution_forward.hpp
+17
-1
No files found.
include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -16,11 +16,11 @@ struct SmoothquantPipelineTwoPass
using
Problem
=
ck_tile
::
remove_cvref_t
<
Problem_
>
;
using
Policy
=
ck_tile
::
remove_cvref_t
<
Policy_
>
;
using
XDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
XDataType
>
;
using
X
ScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
X
ScaleDataType
>
;
using
ComputeDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
ComputeDataType
>
;
using
QYDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
QYDataType
>
;
using
YScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
YScaleDataType
>
;
using
XDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
XDataType
>
;
using
Smooth
ScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
Smooth
ScaleDataType
>
;
using
ComputeDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
ComputeDataType
>
;
using
QYDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
QYDataType
>
;
using
YScaleDataType
=
ck_tile
::
remove_cvref_t
<
typename
Problem
::
YScaleDataType
>
;
static
constexpr
bool
kNeedCrossWarpSync
=
Problem
::
kNeedCrossWarpSync
;
static
constexpr
bool
kPadM
=
false
;
// TODO - BlockSmoothquantProblem::kPadM
...
...
@@ -39,9 +39,12 @@ struct SmoothquantPipelineTwoPass
return
Policy
::
template
GetSmemSize
<
Problem
>();
}
template
<
typename
XWindow
,
typename
XScaleWindow
,
typename
QYWindow
,
typename
YScaleWindow
>
template
<
typename
XWindow
,
typename
SmoothScaleWindow
,
typename
QYWindow
,
typename
YScaleWindow
>
CK_TILE_DEVICE
auto
operator
()(
const
XWindow
&
x_window_
,
const
X
ScaleWindow
&
x
scale_window_
,
const
Smooth
ScaleWindow
&
sm
scale_window_
,
YScaleWindow
&
yscale_window
,
QYWindow
&
qy_window
,
ck_tile
::
index_t
row_size
,
...
...
@@ -49,8 +52,8 @@ struct SmoothquantPipelineTwoPass
{
auto
x_window
=
make_tile_window
(
x_window_
,
Policy
::
template
MakeXBlockTileDistribution
<
Problem
>());
auto
x
scale_window
=
make_tile_window
(
x
scale_window_
,
Policy
::
template
Make
X
ScaleBlockTileDistribution
<
Problem
>());
auto
sm
scale_window
=
make_tile_window
(
sm
scale_window_
,
Policy
::
template
Make
Smooth
ScaleBlockTileDistribution
<
Problem
>());
static
constexpr
index_t
Block_N
=
Problem
::
BlockShape
::
Block_N
;
index_t
num_n_tile_iteration
=
...
...
@@ -76,14 +79,14 @@ struct SmoothquantPipelineTwoPass
for
(
int
iN
=
__builtin_amdgcn_readfirstlane
(
0
);
iN
<
num_n_tile_iteration
;
++
iN
)
{
const
auto
x
=
load_tile
(
x_window
);
const
auto
x
scale
=
load_tile
(
x
scale_window
);
const
auto
y
=
tile_elementwise_in
(
const
auto
x
=
load_tile
(
x_window
);
const
auto
sm
scale
=
load_tile
(
sm
scale_window
);
const
auto
y
=
tile_elementwise_in
(
[
&
](
const
auto
&
a
,
const
auto
&
b
)
{
return
type_convert
<
ComputeDataType
>
(
a
)
*
type_convert
<
ComputeDataType
>
(
b
);
},
x
,
x
scale
);
sm
scale
);
constexpr
auto
x_size_per_row
=
x
.
get_tile_distribution
().
get_ys_to_d_descriptor
().
get_lengths
().
at
(
number
<
1
>
{});
...
...
@@ -94,7 +97,7 @@ struct SmoothquantPipelineTwoPass
block_reduce2d
(
y
,
absmax
,
reduce_absmax_func
);
move_tile_window
(
x_window
,
{
0
,
Block_N
});
move_tile_window
(
x
scale_window
,
{
Block_N
});
move_tile_window
(
sm
scale_window
,
{
Block_N
});
}
// compute absmax, cross-lane->cross-warp
...
...
@@ -114,20 +117,20 @@ struct SmoothquantPipelineTwoPass
row_size
%
Block_N
==
0
?
row_size
-
Block_N
:
row_size
-
row_size
%
Block_N
;
move_tile_window
(
x_window
,
{
0
,
-
Block_N
});
move_tile_window
(
x
scale_window
,
{
-
Block_N
});
move_tile_window
(
sm
scale_window
,
{
-
Block_N
});
move_tile_window
(
qy_window
,
{
0
,
stride_to_right_most_window
});
// recompute y and quantize y to qy
for
(
int
iN
=
__builtin_amdgcn_readfirstlane
(
0
);
iN
<
num_n_tile_iteration
;
++
iN
)
{
const
auto
x
=
load_tile
(
x_window
);
const
auto
x
scale
=
load_tile
(
x
scale_window
);
const
auto
y
=
tile_elementwise_in
(
const
auto
x
=
load_tile
(
x_window
);
const
auto
sm
scale
=
load_tile
(
sm
scale_window
);
const
auto
y
=
tile_elementwise_in
(
[
&
](
const
auto
&
a
,
const
auto
&
b
)
{
return
type_convert
<
ComputeDataType
>
(
a
)
*
type_convert
<
ComputeDataType
>
(
b
);
},
x
,
x
scale
);
sm
scale
);
auto
qy
=
make_static_distributed_tensor
<
QYDataType
>
(
y
.
get_tile_distribution
());
sweep_tile
(
qy
,
[
&
](
auto
idx
)
{
...
...
@@ -138,7 +141,7 @@ struct SmoothquantPipelineTwoPass
store_tile
(
qy_window
,
qy
);
move_tile_window
(
x_window
,
{
0
,
-
Block_N
});
move_tile_window
(
x
scale_window
,
{
0
,
-
Block_N
});
move_tile_window
(
sm
scale_window
,
{
0
,
-
Block_N
});
move_tile_window
(
qy_window
,
{
0
,
-
Block_N
});
}
}
...
...
include/ck_tile/ops/softmax.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck_tile/ops/topk.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck_tile/ops/topk_softmax.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck_tile/ref/naive_attention.hpp
View file @
96a0d5f6
...
...
@@ -13,13 +13,18 @@ namespace ck_tile {
enum
class
naive_attention_layout_enum
{
BSHD
,
// [batch, seqlen, nhead, hdim]
BHSD
,
// [batch, nhead, seqlen, hdim]
BS3HD
,
// [batch, nhead, 3, seqlen, hdim], used when qkv are packed
PHSD
,
// [pages, nhead, page_size, hdim]
DEFAULT
,
// maybe this tensor is not used, set some irrelevant value
BSHD
,
// [batch, seqlen, nhead, hdim]
BHSD
,
// [batch, nhead, seqlen, hdim]
BS3HD
,
// [batch, nhead, 3, seqlen, hdim], used when qkv are packed
PHSD
,
// [pages, nhead, page_size, hdim]
// PHSDX, // [pages, nhead, page_size/x, hdim, x], where <# used pages>*page_size = seqlen
PHDSX
,
// [pages, nhead, hdim/x, page_size, x], where <# used pages>*page_size = seqlen
PHDS
,
// [pages, nhead, hdim, page_size], where <# used pages>*page_size = seqlen
// scale layout used for dynamic dequant
SCALE_HS
,
// [nhead, tokens] or [nhead, tokens-per-group], nhe KVCache quant
SCALE_SH
,
// [tokens, nhead]
};
// will used to specialize kernel variation
...
...
@@ -30,6 +35,15 @@ enum class naive_attention_variation_enum
DECODE_PAGED
,
// decode attn, where kv token from another buffer called kvcache
};
enum
class
naive_attention_quant_algo
{
NO
=
0
,
KV_8BIT_PERHEAD
=
1
,
// FP8/INT8 quant for KVCache, per-token quant
// [num_tokens, nhead, hdim] -> [nhead, num_tokens]
KV_8BIT_PERTOKEN
=
2
,
};
// TODO: for simplicity, this will be used as host/device arg
struct
naive_attention_fwd_args
{
...
...
@@ -40,7 +54,8 @@ struct naive_attention_fwd_args
void
*
context_len_ptr
;
// [batch] used when seqlen kv come from a pointer(each element is a
// number, not cumsum)
void
*
page_table_ptr
;
// [batch, max_pages_per_seq] seqlen_kv is in different block(paged attn)
void
*
kvscale_ptr
;
// [nhead, 2(kv), hdim] used for kvcache dequant
void
*
kscale_ptr
;
// [nhead, max_kv_tokens] used for kvcache dequant
void
*
vscale_ptr
;
// [nhead, max_kv_tokens] used for kvcache dequant
float
scale_s
;
int
hdim
;
int
hdim_v
;
// could be cross-attn, where V and Q/K hdim are different
...
...
@@ -54,6 +69,7 @@ struct naive_attention_fwd_args
int
nhead_ratio_kv
;
// nhead_q / nhead_kv
int
page_size
;
// if paged, the seqlen-kv per each block
int
max_pages_per_seq
;
int
max_kv_tokens
;
// used as stride to access kv scale ptr
};
// this is trait for host API
...
...
@@ -67,14 +83,16 @@ struct naive_attention_fwd_traits
std
::
string
k_layout
;
std
::
string
v_layout
;
std
::
string
o_layout
;
int
variation
;
// sync with naive_attention_variation_enum
int
variation
;
// sync with naive_attention_variation_enum
int
quant_algo
;
// sync with naive_attention_quant_algo
};
// this is trait for kernel template
template
<
naive_attention_variation_enum
variation_
>
template
<
naive_attention_variation_enum
variation_
,
naive_attention_quant_algo
quant_algo_
>
struct
naive_attention_fwd_kernel_traits
{
static
constexpr
naive_attention_variation_enum
variation
=
variation_
;
static
constexpr
naive_attention_quant_algo
quant_algo
=
quant_algo_
;
};
// for simplicity, please do not use const-reference type for the template type
...
...
@@ -83,28 +101,39 @@ template <typename QType,
typename
VType
,
typename
OType
,
typename
AccType
,
typename
KVScaleType
,
naive_attention_layout_enum
QLayout
,
naive_attention_layout_enum
KLayout
,
naive_attention_layout_enum
VLayout
,
naive_attention_layout_enum
OLayout
,
naive_attention_layout_enum
KScaleLayout
,
naive_attention_layout_enum
VScaleLayout
,
typename
Traits
>
struct
naive_attention_fwd_kernel
{
static
constexpr
bool
is_kvcache_i8
=
std
::
is_same_v
<
KType
,
int8_t
>
&&
std
::
is_same_v
<
VType
,
int8_t
>
&&
sizeof
(
QType
)
!=
1
;
std
::
is_same_v
<
KType
,
int8_t
>
&&
std
::
is_same_v
<
VType
,
int8_t
>
;
static
constexpr
bool
is_kvcache_fp8
=
std
::
is_same_v
<
KType
,
fp8_t
>
&&
std
::
is_same_v
<
VType
,
fp8_t
>
;
// kvcache-i8 will have per head scale, we apply this scale to Q/P matrix instead of original
// K/V matrix. This can speed up conversion since Q/P usually is fp16/bf16/fp32
static
constexpr
bool
is_kvcache_i8_forward_quant
=
is_kvcache_i8
;
static
constexpr
int
v_per_token_quant_group_size
=
64
;
// TODO: hardcode
using
KVScaleType
=
float
;
using
SoftmaxType
=
float
;
using
PType
=
VType
;
// src A of gemm2, same type as V
using
SoftmaxType
=
float
;
// always using float to do softmax compute
using
QuantComputeType
=
float
;
// used for quant/dequant scale compute
using
QCompute
=
KType
;
// src A of gemm1, same type as K
using
PType
=
VType
;
// src A of gemm2, same type as V
using
OAccType
=
float
;
// always float, in case int8 FA
using
p_vec_type
=
ext_vector_t
<
PType
,
16
/
sizeof
(
PType
)
>
;
static
constexpr
int
p_vec_elem
=
vector_traits
<
p_vec_type
>::
vector_size
;
// clang-format off
template
<
typename
T_
>
struct
scale_max
{
static
constexpr
float
value
=
1
;
/* dummy code */
};
template
<
>
struct
scale_max
<
int8_t
>
{
static
constexpr
float
value
=
127.0
;
};
template
<
>
struct
scale_max
<
fp8_t
>
{
static
constexpr
float
value
=
240.0
;
};
// clang-format on
__host__
__device__
naive_attention_fwd_kernel
()
{}
template
<
typename
T
,
naive_attention_layout_enum
Layout
>
...
...
@@ -198,24 +227,31 @@ struct naive_attention_fwd_kernel
__device__
void
store
(
T
/*value*/
,
int
/*i_s*/
,
int
/*i_d*/
)
{}
};
template
<
typename
T
>
template
<
typename
T
,
naive_attention_layout_enum
Layout
>
struct
kvscale_addresser
{
int
h
,
d
;
// nhead, hdim
int
s
,
h
,
d
;
//
seqlen(tokens),
nhead, hdim
T
*
base_ptr
;
__device__
kvscale_addresser
(
int
h_
,
int
d_
,
void
*
p_
)
:
h
(
h_
),
d
(
d_
),
base_ptr
(
reinterpret_cast
<
T
*>
(
p_
))
__device__
kvscale_addresser
(
int
s_
,
int
h_
,
int
d_
,
void
*
p_
)
:
s
(
s_
),
h
(
h_
),
d
(
d_
),
base_ptr
(
reinterpret_cast
<
T
*>
(
p_
))
{
}
__device__
int
get_offset
(
int
i_
h
,
int
i_
d
,
int
i_
kv
/*0 or 1*/
)
__device__
int
get_offset
(
int
i_
s
,
int
i_
h
,
int
i_
d
)
{
if
constexpr
(
Layout
==
naive_attention_layout_enum
::
SCALE_HS
)
{
// [nhead, tokens]
(
void
)
i_d
;
return
i_h
*
s
+
i_s
;
}
else
if
constexpr
(
Layout
==
naive_attention_layout_enum
::
DEFAULT
)
{
return
0
;
}
// [h, 2, d]
return
i_h
*
2
*
d
+
i_kv
*
d
+
i_d
;
}
__device__
T
load
(
int
i_h
,
int
i_d
,
int
i_kv
)
{
return
base_ptr
[
get_offset
(
i_h
,
i_d
,
i_kv
)];
// return i_h * 2 * d + i_kv * d + i_d;
}
__device__
T
load
(
int
i_s
,
int
i_h
,
int
i_d
)
{
return
base_ptr
[
get_offset
(
i_s
,
i_h
,
i_d
)];
}
};
__device__
__host__
static
constexpr
int
get_block_size
()
{
return
256
;
}
...
...
@@ -282,12 +318,13 @@ struct naive_attention_fwd_kernel
__device__
void
operator
()(
naive_attention_fwd_args
args
)
{
constexpr
int
wg_size
=
get_block_size
();
__shared__
char
smem
[
wg_size
*
4
*
sizeof
(
float
)];
// should enough
int
i_dv
=
blockIdx
.
x
*
wg_size
+
threadIdx
.
x
;
// index of hdim_v
int
i_sq
=
blockIdx
.
y
;
// index of seqlen_q
int
i_batch
=
blockIdx
.
z
;
// index of batch_q * nhead_q
int
i_bq
=
i_batch
/
args
.
nhead_q
;
// index of batch_q
int
i_hq
=
i_batch
%
args
.
nhead_q
;
// index of nhead_q
__shared__
char
smem
[
wg_size
*
4
*
sizeof
(
float
)];
// should enough
char
*
smem_quant_q
=
smem
+
wg_size
*
2
*
sizeof
(
float
);
// second half, should enough
int
i_dv
=
blockIdx
.
x
*
wg_size
+
threadIdx
.
x
;
// index of hdim_v
int
i_sq
=
blockIdx
.
y
;
// index of seqlen_q
int
i_batch
=
blockIdx
.
z
;
// index of batch_q * nhead_q
int
i_bq
=
i_batch
/
args
.
nhead_q
;
// index of batch_q
int
i_hq
=
i_batch
%
args
.
nhead_q
;
// index of nhead_q
int
i_bk
=
i_bq
/
args
.
batch_ratio_kv
;
int
i_hk
=
i_hq
/
args
.
nhead_ratio_kv
;
...
...
@@ -360,9 +397,10 @@ struct naive_attention_fwd_kernel
auto
f_max
=
[](
auto
x_
,
auto
y_
)
{
return
max
(
x_
,
y_
);
};
auto
f_sum
=
[](
auto
x_
,
auto
y_
)
{
return
x_
+
y_
;
};
auto
f_absmax_f32
=
[](
float
v_0_
,
float
v_1_
)
{
float
rtn
;
asm
volatile
(
"v_max_f32 %0, abs(%1), abs(%2)"
:
"=v"
(
rtn
)
:
"v"
(
v_0_
),
"v"
(
v_1_
));
return
rtn
;
// float rtn;
// asm volatile("v_max_f32 %0, abs(%1), abs(%2)" : "=v"(rtn) : "v"(v_0_), "v"(v_1_));
// return rtn;
return
max
(
abs
(
v_0_
),
abs
(
v_1_
));
};
int
seqlen_kv
=
[
&
]()
{
...
...
@@ -378,45 +416,82 @@ struct naive_attention_fwd_kernel
SoftmaxType
row_max
=
-
numeric
<
SoftmaxType
>::
infinity
();
SoftmaxType
l
{
0
};
AccType
o_acc
=
{
0
};
// AccType o_acc = {0};
OAccType
o_acc
=
{
0
};
int
sk_loops
=
(
seqlen_kv
+
wg_size
-
1
)
/
wg_size
;
float
qf_scale
=
.0
f
;
kvscale_addresser
<
KVScaleType
>
kvscale_addr
{
args
.
nhead_kv
,
args
.
hdim
,
args
.
kvscale_ptr
};
int
sk_loops
=
(
seqlen_kv
+
wg_size
-
1
)
/
wg_size
;
QuantComputeType
q_dequant_scale
=
.0
f
;
kvscale_addresser
<
KVScaleType
,
KScaleLayout
>
kscale_addr
{
args
.
max_kv_tokens
,
args
.
nhead_kv
,
args
.
hdim
,
args
.
kscale_ptr
};
kvscale_addresser
<
KVScaleType
,
VScaleLayout
>
vscale_addr
{
args
.
max_kv_tokens
,
args
.
nhead_kv
,
args
.
hdim_v
,
args
.
vscale_ptr
};
if
constexpr
(
is_kvcache_i8_forward_quant
)
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERHEAD
)
{
// AccType is i32 now, seqlen_q = 1, hdim up to 256
float
q
=
0
;
float
k_s
=
0
;
AccType
q
=
0
;
AccType
k_s
=
0
;
if
(
static_cast
<
int
>
(
threadIdx
.
x
)
<
args
.
hdim
)
{
q
=
type_convert
<
float
>
(
q_addr
.
load
(
0
,
threadIdx
.
x
));
k_s
=
type_convert
<
float
>
(
k
v
scale_addr
.
load
(
i_hk
,
threadIdx
.
x
,
0
));
q
=
type_convert
<
AccType
>
(
q_addr
.
load
(
0
,
threadIdx
.
x
));
k_s
=
type_convert
<
AccType
>
(
kscale_addr
.
load
(
i_hk
,
threadIdx
.
x
,
0
));
}
// 1) we apply the k scale to q
float
q_forwarded
=
q
*
k_s
;
AccType
q_forwarded
=
q
*
k_s
;
// 2) apply smooth-quant
// find absmax
float
qf_max
=
wave_reduce
(
q_forwarded
,
f_absmax_f32
);
qf_max
=
cross_wave_reduce
(
qf_max
,
f_absmax_f32
,
reinterpret_cast
<
float
*>
(
smem
));
AccType
qf_max
=
wave_reduce
(
q_forwarded
,
f_absmax_f32
);
qf_max
=
cross_wave_reduce
(
qf_max
,
f_absmax_f32
,
reinterpret_cast
<
AccType
*>
(
smem
));
// per-token scale
q
f
_scale
=
qf_max
/
127.0
;
q
_dequant
_scale
=
type_convert
<
QuantComputeType
>
(
qf_max
)
/
scale_max
<
QCompute
>::
value
;
// devide by scale
q
=
q
/
q
f
_scale
;
q
=
q
/
q
_dequant
_scale
;
// fp32->i8
int8_t
quantized_q
=
static_cast
<
int8_t
>
(
q
);
QCompute
quantized_q
=
static_cast
<
QCompute
>
(
q
);
__syncthreads
();
reinterpret_cast
<
int8_t
*>
(
smem
)[
threadIdx
.
x
]
=
quantized_q
;
reinterpret_cast
<
QCompute
*>
(
smem
)[
threadIdx
.
x
]
=
quantized_q
;
__syncthreads
();
// after above process, we have 2 data
// 1) int8 q data stored in smem(no need to reload)
// 2) per-token scale qf_scale, to be mul after 1st gemm
// 2) per-token scale q_dequant_scale, to be mul after 1st gemm
}
else
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERTOKEN
)
{
if
(
std
::
is_same_v
<
QType
,
fp16_t
>
||
std
::
is_same_v
<
QType
,
bf16_t
>
)
{
// dyanmic quant q here
float
q
=
0
;
if
(
static_cast
<
int
>
(
threadIdx
.
x
)
<
args
.
hdim
)
{
q
=
type_convert
<
float
>
(
q_addr
.
load
(
i_sq
,
threadIdx
.
x
));
}
// apply smooth-quant
// find absmax
float
q_max
=
wave_reduce
(
q
,
f_absmax_f32
);
q_max
=
cross_wave_reduce
(
q_max
,
f_absmax_f32
,
reinterpret_cast
<
float
*>
(
smem
));
// per-token scale
q_dequant_scale
=
type_convert
<
QuantComputeType
>
(
q_max
)
/
scale_max
<
QCompute
>::
value
;
// devide by scale
q
=
q
/
q_dequant_scale
;
QCompute
quantized_q
=
type_convert
<
QCompute
>
(
q
);
__syncthreads
();
reinterpret_cast
<
QCompute
*>
(
smem_quant_q
)[
threadIdx
.
x
]
=
quantized_q
;
__syncthreads
();
// after above process, we have 2 data
// 1) fp8 q data stored in smem(no need to reload from global)
// 2) per-token scale q_dequant_scale, to be mul after 1st gemm
}
}
for
(
int
i_loop1
=
0
;
i_loop1
<
sk_loops
;
i_loop1
++
)
...
...
@@ -429,33 +504,41 @@ struct naive_attention_fwd_kernel
AccType
s_acc
{
0
};
// clear for every loop
for
(
auto
i_dq
=
0
;
i_dq
<
args
.
hdim
;
i_dq
++
)
{
if
constexpr
(
is_kvcache_i8_forward_quant
)
{
int8_t
q
=
reinterpret_cast
<
int8_t
*>
(
smem
)[
i_dq
];
auto
k
=
k_addr
.
load
(
i_sk
,
i_dq
);
s_acc
+=
type_convert
<
AccType
>
(
q
)
*
type_convert
<
AccType
>
(
k
);
}
else
{
auto
q
=
q_addr
.
load
(
i_sq
,
i_dq
);
// q will have duplicate load
auto
k
=
k_addr
.
load
(
i_sk
,
i_dq
);
auto
q
=
[
&
]()
{
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERHEAD
||
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERTOKEN
)
{
return
reinterpret_cast
<
QCompute
*>
(
smem_quant_q
)[
i_dq
];
}
else
return
q_addr
.
load
(
i_sq
,
i_dq
);
// q will have duplicate load
}();
auto
k
=
[
&
]()
{
return
k_addr
.
load
(
i_sk
,
i_dq
);
}();
s_acc
+=
type_convert
<
AccType
>
(
q
)
*
type_convert
<
AccType
>
(
k
);
}
s_acc
+=
type_convert
<
AccType
>
(
q
)
*
type_convert
<
AccType
>
(
k
);
}
// scale
s_softmax
=
type_convert
<
SoftmaxType
>
(
s_acc
);
s_softmax
*=
type_convert
<
SoftmaxType
>
(
args
.
scale_s
*
ck_tile
::
log2e_v
<
SoftmaxType
>
);
if
constexpr
(
is_kvcache_i8_forward_quant
)
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERHEAD
)
{
s_softmax
*=
q_dequant_scale
;
// post scale the per-token factor
}
else
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERTOKEN
)
{
s_softmax
*=
qf_scale
;
// post scale the per-token factor
SoftmaxType
k_per_token_scale
=
type_convert
<
SoftmaxType
>
(
kscale_addr
.
load
(
i_sk
,
i_hk
,
0
));
s_softmax
*=
q_dequant_scale
;
s_softmax
*=
k_per_token_scale
;
}
}
// s->p
float
pf
_scale
=
0
.
;
// used for i8 quant
QuantComputeType
p_dequant
_scale
=
1
.
;
{
// softmax, find max
SoftmaxType
old_max
=
row_max
;
...
...
@@ -473,41 +556,69 @@ struct naive_attention_fwd_kernel
// l, pre-scall o_acc
SoftmaxType
tmp
=
__builtin_amdgcn_exp2f
(
old_max
-
row_max
);
l
=
tmp
*
l
+
row_sum
;
o_acc
=
type_convert
<
AccType
>
(
type_convert
<
SoftmaxType
>
(
o_acc
)
*
tmp
);
o_acc
=
type_convert
<
O
AccType
>
(
type_convert
<
SoftmaxType
>
(
o_acc
)
*
tmp
);
// prepare the p_compute into smem, to let every thread read same p_compute and do
// 2nd gemm
if
constexpr
(
is_kvcache_i8_forward_quant
)
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERHEAD
)
{
float
v_s
=
0
;
QuantComputeType
v_s
=
0
;
if
(
static_cast
<
int
>
(
threadIdx
.
x
)
<
args
.
hdim_v
)
{
v_s
=
type_convert
<
float
>
(
kvscale_addr
.
load
(
i_hk
,
threadIdx
.
x
,
1
));
v_s
=
type_convert
<
QuantComputeType
>
(
vscale_addr
.
load
(
i_hk
,
threadIdx
.
x
,
1
));
}
// 1) we apply the v scale to p
float
p_forwarded
=
p_compute
*
v_s
;
QuantComputeType
p_forwarded
=
p_compute
*
v_s
;
// 2) apply smooth-quant
// find absmax
float
pf_max
=
wave_reduce
(
p_forwarded
,
f_absmax_f32
);
pf_max
=
cross_wave_reduce
(
pf_max
,
f_absmax_f32
,
reinterpret_cast
<
float
*>
(
smem
));
QuantComputeType
pf_max
=
wave_reduce
(
p_forwarded
,
f_absmax_f32
);
pf_max
=
cross_wave_reduce
(
pf_max
,
f_absmax_f32
,
reinterpret_cast
<
QuantComputeType
*>
(
smem
));
// per-token scale
p
f
_scale
=
pf_max
/
127.0
;
p
_dequant
_scale
=
pf_max
/
scale_max
<
PType
>::
value
;
//
127.0;
// devide by scale
p_compute
=
p_compute
/
p
f
_scale
;
p_compute
=
p_compute
/
p
_dequant
_scale
;
// fp32->i8
int8_t
quantized_p
=
static_cast
<
int8_t
>
(
p_compute
);
PType
quantized_p
=
static_cast
<
PType
>
(
p_compute
);
__syncthreads
();
reinterpret_cast
<
int8_t
*>
(
smem
)[
threadIdx
.
x
]
=
quantized_p
;
reinterpret_cast
<
PType
*>
(
smem
)[
threadIdx
.
x
]
=
quantized_p
;
__syncthreads
();
// after above process, we have 2 data
// 1) int8 p data stored in smem(no need to reload)
// 2) per-token scale pf_scale, to be mul after 2nd gemm
// 2) per-token scale p_dequant_scale, to be mul after 2nd gemm
}
else
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERTOKEN
)
{
// forward apply the v scale to p_compute, this is compute friendly
auto
v_scale
=
type_convert
<
QuantComputeType
>
(
vscale_addr
.
load
(
i_sk
,
i_hk
,
0
));
p_compute
*=
v_scale
;
// smooth-quant
// find absmax
QuantComputeType
p_max
=
wave_reduce
(
p_compute
,
f_absmax_f32
);
p_max
=
cross_wave_reduce
(
p_max
,
f_absmax_f32
,
reinterpret_cast
<
QuantComputeType
*>
(
smem
));
// per-token scale
p_dequant_scale
=
p_max
/
scale_max
<
PType
>::
value
;
// 240.0;
// devide by scale
p_compute
=
p_compute
/
p_dequant_scale
;
// fp32->i8
PType
quantized_p
=
type_convert
<
PType
>
(
p_compute
);
__syncthreads
();
reinterpret_cast
<
PType
*>
(
smem
)[
threadIdx
.
x
]
=
quantized_p
;
__syncthreads
();
// after above process, we have 2 data
// 1) fp8_t p data stored in smem(no need to reload)
// 2) per-token scale p_dequant_scale, to be mul after 2nd gemm
}
else
{
...
...
@@ -531,29 +642,45 @@ struct naive_attention_fwd_kernel
int
sv_offset
=
i_loop2
*
p_vec_elem
+
i_j
;
int
i_sv
=
sk_start
+
sv_offset
;
VType
v
=
0
.
f
;
VType
v
=
0
;
if
(
i_dv
<
args
.
hdim_v
&&
i_sv
<
seqlen_kv
)
{
v
=
v_addr
.
load
(
i_sv
,
i_dv
);
}
o_acc_local
+=
type_convert
<
AccType
>
(
p_vec
[
i_j
])
*
type_convert
<
AccType
>
(
v
);
AccType
v_compute
=
[
&
]()
{
return
type_convert
<
AccType
>
(
v
);
}();
o_acc_local
+=
type_convert
<
AccType
>
(
p_vec
[
i_j
])
*
v_compute
;
}
}
if
constexpr
(
is_kvcache_i8_forward_quant
)
{
// apply pr scale to local acc
o_acc_local
=
type_convert
<
AccType
>
(
type_convert
<
float
>
(
o_acc_local
)
*
pf_scale
);
}
o_acc
+=
o_acc_local
;
OAccType
post_scale_o_acc_local
=
[
&
]()
{
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERHEAD
)
{
// apply pr scale to local acc
return
type_convert
<
OAccType
>
(
type_convert
<
QuantComputeType
>
(
o_acc_local
)
*
p_dequant_scale
);
}
else
if
constexpr
(
Traits
::
quant_algo
==
naive_attention_quant_algo
::
KV_8BIT_PERTOKEN
)
{
// apply pr scale to local acc
return
type_convert
<
OAccType
>
(
type_convert
<
QuantComputeType
>
(
o_acc_local
)
*
p_dequant_scale
);
}
else
{
return
type_convert
<
OAccType
>
(
o_acc_local
);
}
}();
o_acc
+=
post_scale_o_acc_local
;
}
}
// post scale o_acc
{
SoftmaxType
tmp
=
l
==
0.
f
?
0.
f
:
1.
f
/
l
;
// in case masking
o_acc
=
type_convert
<
AccType
>
(
type_convert
<
SoftmaxType
>
(
o_acc
)
*
tmp
);
o_acc
=
type_convert
<
O
AccType
>
(
type_convert
<
SoftmaxType
>
(
o_acc
)
*
tmp
);
}
// store O
...
...
@@ -564,18 +691,21 @@ struct naive_attention_fwd_kernel
#define CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_() \
{ \
using ktraits_ =
\
naive_attention_fwd_kernel_traits<
static_cast<naive_attention_variation_enum>( \
variation_)>;
\
using ktraits_ =
naive_attention_fwd_kernel_traits<
\
static_cast<naive_attention_variation_enum>(
variation_),
\
static_cast<naive_attention_quant_algo>(quant_algo_)>;
\
using k_ = naive_attention_fwd_kernel<q_type_, \
k_type_, \
v_type_, \
o_type_, \
acc_type_, \
kvscale_type_, \
q_layout_, \
k_layout_, \
v_layout_, \
o_layout_, \
k_scale_layout_, \
v_scale_layout_, \
ktraits_>; \
dim3 grids = k_::get_grid_size(a); \
r = ck_tile::launch_kernel(s, \
...
...
@@ -586,31 +716,37 @@ struct naive_attention_fwd_kernel
if(t.variation == 0 && t.q_layout == "bshd" && t.k_layout == "bshd" && t.v_layout == "bshd" && \
t.o_layout == "bshd") \
{ \
constexpr auto q_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto k_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto v_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto o_layout_ = naive_attention_layout_enum::BSHD; \
constexpr int variation_ = 0; \
constexpr auto q_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto k_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto v_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto o_layout_ = naive_attention_layout_enum::BSHD; \
constexpr auto k_scale_layout_ = naive_attention_layout_enum::DEFAULT; \
constexpr auto v_scale_layout_ = naive_attention_layout_enum::DEFAULT; \
constexpr int variation_ = 0; \
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \
} \
else if(t.variation == 0 && t.q_layout == "bhsd" && t.k_layout == "bhsd" && \
t.v_layout == "bhsd" && t.o_layout == "bhsd") \
{ \
constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto v_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \
constexpr int variation_ = 0; \
constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto v_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_scale_layout_ = naive_attention_layout_enum::DEFAULT; \
constexpr auto v_scale_layout_ = naive_attention_layout_enum::DEFAULT; \
constexpr int variation_ = 0; \
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \
} \
else if(t.variation == 2 && t.q_layout == "bhsd" && t.k_layout == "phdsx" && \
t.v_layout == "phds" && t.o_layout == "bhsd") \
{ \
constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX; \
constexpr auto v_layout_ = naive_attention_layout_enum::PHDS; \
constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \
constexpr int variation_ = 2; \
constexpr auto q_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_layout_ = naive_attention_layout_enum::PHDSX; \
constexpr auto v_layout_ = naive_attention_layout_enum::PHDS; \
constexpr auto o_layout_ = naive_attention_layout_enum::BHSD; \
constexpr auto k_scale_layout_ = naive_attention_layout_enum::SCALE_HS; \
constexpr auto v_scale_layout_ = naive_attention_layout_enum::SCALE_HS; \
constexpr int variation_ = 2; \
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_INTERNAL_(); \
}
...
...
@@ -621,40 +757,64 @@ CK_TILE_HOST float naive_attention_fwd(naive_attention_fwd_traits t,
{
float
r
=
-
1
;
// TODO: do not explicitly create too much instance!
if
(
t
.
q_type
==
"fp16"
&&
t
.
k_type
==
"fp16"
&&
t
.
v_type
==
"fp16"
&&
t
.
o_type
==
"fp16"
)
if
(
t
.
q_type
==
"fp16"
&&
t
.
k_type
==
"fp16"
&&
t
.
v_type
==
"fp16"
&&
t
.
o_type
==
"fp16"
&&
t
.
quant_algo
==
0
)
{
using
q_type_
=
fp16_t
;
using
k_type_
=
fp16_t
;
using
v_type_
=
fp16_t
;
using
o_type_
=
fp16_t
;
using
acc_type_
=
float
;
using
kvscale_type_
=
float
;
constexpr
int
quant_algo_
=
0
;
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
();
}
else
if
(
t
.
q_type
==
"bf16"
&&
t
.
k_type
==
"bf16"
&&
t
.
v_type
==
"bf16"
&&
t
.
o_type
==
"bf16"
&&
t
.
quant_algo
==
0
)
{
using
q_type_
=
fp16_t
;
using
k_type_
=
fp16_t
;
using
v_type_
=
fp16_t
;
using
o_type_
=
fp16_t
;
using
acc_type_
=
float
;
using
q_type_
=
bf16_t
;
using
k_type_
=
bf16_t
;
using
v_type_
=
bf16_t
;
using
o_type_
=
bf16_t
;
using
acc_type_
=
float
;
using
kvscale_type_
=
float
;
constexpr
int
quant_algo_
=
0
;
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
();
}
else
if
(
t
.
q_type
==
"bf16"
&&
t
.
k_type
==
"bf16"
&&
t
.
v_type
==
"bf16"
&&
t
.
o_type
==
"bf16"
)
else
if
(
t
.
q_type
==
"bf16"
&&
t
.
k_type
==
"fp8"
&&
t
.
v_type
==
"fp8"
&&
t
.
o_type
==
"bf16"
&&
t
.
quant_algo
==
2
)
{
using
q_type_
=
bf16_t
;
using
k_type_
=
bf16_t
;
using
v_type_
=
bf16_t
;
using
o_type_
=
bf16_t
;
using
acc_type_
=
float
;
using
q_type_
=
bf16_t
;
using
k_type_
=
fp8_t
;
using
v_type_
=
fp8_t
;
using
o_type_
=
bf16_t
;
using
acc_type_
=
float
;
// NOTE!
using
kvscale_type_
=
float
;
constexpr
int
quant_algo_
=
2
;
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
();
}
else
if
(
t
.
q_type
==
"bf16"
&&
t
.
k_type
==
"int8"
&&
t
.
v_type
==
"int8"
&&
t
.
o_type
==
"bf16"
)
else
if
(
t
.
q_type
==
"fp16"
&&
t
.
k_type
==
"fp8"
&&
t
.
v_type
==
"fp8"
&&
t
.
o_type
==
"fp16"
&&
t
.
quant_algo
==
2
)
{
using
q_type_
=
bf16_t
;
using
k_type_
=
int8_t
;
using
v_type_
=
int8_t
;
using
o_type_
=
bf16_t
;
using
acc_type_
=
int32_t
;
// NOTE!
using
q_type_
=
fp16_t
;
using
k_type_
=
fp8_t
;
using
v_type_
=
fp8_t
;
using
o_type_
=
fp16_t
;
using
acc_type_
=
float
;
// NOTE!
using
kvscale_type_
=
float
;
constexpr
int
quant_algo_
=
2
;
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
();
}
else
if
(
t
.
q_type
==
"fp16"
&&
t
.
k_type
==
"int8"
&&
t
.
v_type
==
"int8"
&&
t
.
o_type
==
"fp16"
)
else
if
(
t
.
q_type
==
"bf16"
&&
t
.
k_type
==
"int8"
&&
t
.
v_type
==
"int8"
&&
t
.
o_type
==
"bf16"
&&
t
.
quant_algo
==
2
)
{
using
q_type_
=
fp16_t
;
using
k_type_
=
int8_t
;
using
v_type_
=
int8_t
;
using
o_type_
=
fp16_t
;
using
acc_type_
=
int32_t
;
// NOTE!
using
q_type_
=
bf16_t
;
using
k_type_
=
int8_t
;
using
v_type_
=
int8_t
;
using
o_type_
=
bf16_t
;
using
acc_type_
=
int32_t
;
// NOTE!
using
kvscale_type_
=
float
;
constexpr
int
quant_algo_
=
2
;
CK_TILE_DISPATCH_NAIVE_ATTEN_FWD_LAOYUT_
();
}
return
r
;
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -73,39 +73,9 @@ struct ReferencefpAintBGemm : public device::BaseOperator
ScaleDataType
v_scale
;
ADataType
v_converted_b
;
// use PassThrough instead of ConvertBF16RTN for reference calculation
if
constexpr
(
is_same_v
<
AElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
ConvertBF16RTN
>
)
{
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
}
else
{
arg
.
a_element_op_
(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
}
// same for B matrix
if
constexpr
(
is_same_v
<
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
ConvertBF16RTN
>
)
{
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}(
v_b
,
arg
.
b_k_n_
(
k
,
n
));
}
else
{
arg
.
b_element_op_
(
v_b
,
arg
.
b_k_n_
(
k
,
n
));
}
// same for scale matrix
if
constexpr
(
is_same_v
<
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
ConvertBF16RTN
>
)
{
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}(
v_scale
,
arg
.
scale_k_n_
(
k
,
n
));
}
else
{
arg
.
b_element_op_
(
v_scale
,
arg
.
scale_k_n_
(
k
,
n
));
}
arg
.
a_element_op_
(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
arg
.
b_element_op_
(
v_b
,
arg
.
b_k_n_
(
k
,
n
));
arg
.
b_element_op_
(
v_scale
,
arg
.
scale_k_n_
(
k
,
n
));
v_converted_b
=
type_convert
<
ADataType
>
(
v_b
)
*
v_scale
;
v_acc
+=
ck
::
type_convert
<
AccDataType
>
(
v_a
)
*
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -68,21 +68,32 @@ struct ReferenceGemm : public device::BaseOperator
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
// use PassThrough instead of ConvertBF16RTN for reference calculation
if
constexpr
(
is_same_v
<
AElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
ConvertBF16RTN
>
)
if
constexpr
(
is_same_v
<
ADataType
,
pk_i4_t
>
)
{
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
uint8_t
i4x2
=
arg
.
a_m_k_
(
m
,
k
).
data
;
int8_t
i4
=
0
;
if
(
k
%
2
==
1
)
i4
=
(
i4x2
>>
0
)
&
0xf
;
else
i4
=
(
i4x2
>>
4
)
&
0xf
;
i4
=
i4
-
8
;
v_a
=
type_convert
<
ComputeTypeA
>
(
i4
);
}
else
{
arg
.
a_element_op_
(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
}
// same for B matrix
if
constexpr
(
is_same_v
<
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
ConvertBF16RTN
>
)
if
constexpr
(
is_same_v
<
BDataType
,
pk_i4_t
>
)
{
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}(
v_b
,
arg
.
b_k_n_
(
k
,
n
));
uint8_t
i4x2
=
arg
.
b_k_n_
(
k
,
n
).
data
;
int8_t
i4
=
0
;
if
(
k
%
2
==
1
)
i4
=
(
i4x2
>>
0
)
&
0xf
;
else
i4
=
(
i4x2
>>
4
)
&
0xf
;
i4
=
i4
-
8
;
v_b
=
type_convert
<
ComputeTypeB
>
(
i4
);
}
else
{
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -74,26 +74,8 @@ struct ReferenceGemmMultipleD : public device::BaseOperator
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
// use PassThrough instead of ConvertBF16RTN for reference calculation
if
constexpr
(
is_same_v
<
AElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
ConvertBF16RTN
>
)
{
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
}
else
{
arg
.
a_element_op_
(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
}
// same for B matrix
if
constexpr
(
is_same_v
<
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
ConvertBF16RTN
>
)
{
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}(
v_b
,
arg
.
b_k_n_
(
k
,
n
));
}
else
{
arg
.
b_element_op_
(
v_b
,
arg
.
b_k_n_
(
k
,
n
));
}
arg
.
a_element_op_
(
v_a
,
arg
.
a_m_k_
(
m
,
k
));
arg
.
b_element_op_
(
v_b
,
arg
.
b_k_n_
(
k
,
n
));
v_acc
+=
ck
::
type_convert
<
AccDataType
>
(
v_a
)
*
ck
::
type_convert
<
AccDataType
>
(
v_b
);
...
...
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
View file @
96a0d5f6
...
...
@@ -22,6 +22,7 @@ using I8 = int8_t;
using
I32
=
int32_t
;
using
F8
=
ck
::
f8_t
;
using
BF8
=
ck
::
bf8_t
;
using
I4
=
ck
::
pk_i4_t
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
View file @
96a0d5f6
...
...
@@ -15,6 +15,9 @@
#ifdef DL_KERNELS
#include "gemm_dl.inc"
#endif
#ifdef DPP_KERNELS
#include "gemm_dpp.inc"
#endif
#ifdef CK_USE_WMMA
#include "gemm_wmma.inc"
#endif
...
...
@@ -92,32 +95,24 @@ struct DeviceOperationInstanceFactory<
{
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances
(
op_ptrs
);
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances
(
op_ptrs
);
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances
(
op_ptrs
);
add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances
(
op_ptrs
);
add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances
(
op_ptrs
);
}
}
#endif
...
...
@@ -153,6 +148,39 @@ struct DeviceOperationInstanceFactory<
#endif
#endif // DL_KERNELS
#ifdef DPP_KERNELS
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances
(
op_ptrs
);
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances
(
op_ptrs
);
}
}
#endif
#endif // DPP_KERNELS
#ifdef CK_USE_WMMA
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_b_scale.hpp
0 → 100644
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include <memory>
#include <vector>
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8))
void
add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmV2BScale
<
Row
,
Col
,
Row
,
F16
,
I4
,
F16
,
F16
,
1
,
128
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
template
<
typename
ADataType
,
typename
BDataType
,
typename
BScaleDataType
,
typename
CDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
index_t
ScaleBlockK
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceGemmV2BScale
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
BScaleDataType
,
CDataType
,
1
,
ScaleBlockK
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>>
{
using
DeviceOp
=
DeviceGemmV2BScale
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
BScaleDataType
,
CDataType
,
1
,
ScaleBlockK
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
pk_i4_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_v2_default_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc
View file @
96a0d5f6
...
...
@@ -28,16 +28,6 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
DeviceGemm
<
Col
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
...
...
@@ -48,16 +38,6 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
DeviceGemm
<
Col
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
...
...
@@ -68,16 +48,6 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_dpp.inc
0 → 100644
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <memory>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
#if defined(CK_ENABLE_FP16)
void
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
View file @
96a0d5f6
...
...
@@ -16,7 +16,8 @@ namespace ck {
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
#ifdef CK_ENABLE_FP8
#ifdef CK_ENABLE_BF16
void
add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleDSplitK
<
Row
,
Col
,
...
...
@@ -95,6 +96,86 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_i
PassThrough
,
MultiplyMultiply
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleDSplitK
<
Row
,
Col
,
Tuple
<
Row
,
Col
>
,
Row
,
F8
,
F8
,
Tuple
<
F32
,
F32
>
,
F16
,
PassThrough
,
PassThrough
,
MultiplyMultiply
>>>&
instances
);
void
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleDSplitK
<
Row
,
Col
,
Tuple
<
Row
,
Col
>
,
Row
,
F8
,
F8
,
Tuple
<
F32
,
F32
>
,
F16
,
PassThrough
,
PassThrough
,
MultiplyMultiply
>>>&
instances
);
void
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleDSplitK
<
Row
,
Col
,
Tuple
<
Row
,
Col
>
,
Row
,
F8
,
F8
,
Tuple
<
F32
,
F32
>
,
F16
,
PassThrough
,
PassThrough
,
MultiplyMultiply
>>>&
instances
);
void
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleDSplitK
<
Row
,
Col
,
Tuple
<
Row
,
Col
>
,
Row
,
F8
,
F8
,
Tuple
<
F32
,
F32
>
,
F16
,
PassThrough
,
PassThrough
,
MultiplyMultiply
>>>&
instances
);
void
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleDSplitK
<
Row
,
Col
,
Tuple
<
Row
,
Col
>
,
Row
,
F8
,
F8
,
Tuple
<
F32
,
F32
>
,
F16
,
PassThrough
,
PassThrough
,
MultiplyMultiply
>>>&
instances
);
void
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleDSplitK
<
Row
,
Col
,
Tuple
<
Row
,
Col
>
,
Row
,
F8
,
F8
,
Tuple
<
F32
,
F32
>
,
F16
,
PassThrough
,
PassThrough
,
MultiplyMultiply
>>>&
instances
);
#endif
#endif
#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_INT8))
void
add_device_gemm_multiply_multiply_xdl_i8_i8_bf16_mk_nk_mn_comp_default_instances
(
...
...
@@ -213,7 +294,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
#ifdef CK_ENABLE_FP8
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
ADataType
,
f8_t
>
&&
is_same_v
<
BDataType
,
f8_t
>
&&
is_same_v
<
CDataType
,
bhalf_t
>
)
{
...
...
@@ -237,6 +319,31 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
}
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
ADataType
,
f8_t
>
&&
is_same_v
<
BDataType
,
f8_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_default_instances
(
op_ptrs
);
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_kpadding_instances
(
op_ptrs
);
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_default_instances
(
op_ptrs
);
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_kpadding_instances
(
op_ptrs
);
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_default_instances
(
op_ptrs
);
add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_kpadding_instances
(
op_ptrs
);
}
}
#endif
#endif
#if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_INT8))
if
constexpr
(
is_same_v
<
ADataType
,
int8_t
>
&&
is_same_v
<
BDataType
,
int8_t
>
&&
is_same_v
<
CDataType
,
bhalf_t
>
)
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
View file @
96a0d5f6
...
...
@@ -166,11 +166,22 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmV2
<
Row
,
Col
,
Row
,
F16
,
F8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmV2
<
Row
,
Col
,
Row
,
F16
,
F8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmV2
<
Row
,
Col
,
Row
,
F16
,
I4
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmV2
<
Row
,
Col
,
Row
,
BF16
,
I4
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmV2
<
Row
,
Col
,
Row
,
F16
,
F8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
...
...
@@ -810,6 +821,28 @@ struct DeviceOperationInstanceFactory<
}
}
#endif
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
pk_i4_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances
(
op_ptrs
);
}
}
if
constexpr
(
is_same_v
<
ADataType
,
bhalf_t
>
&&
is_same_v
<
BDataType
,
pk_i4_t
>
&&
is_same_v
<
CDataType
,
bhalf_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
};
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
View file @
96a0d5f6
...
...
@@ -238,6 +238,403 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Row
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Row
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm_Streamk_V2
<
Col
,
Col
,
Row
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#if(defined(CK_ENABLE_FP8))
void
add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances
(
std
::
vector
<
std
::
unique_ptr
<
...
...
@@ -527,6 +924,109 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemm_S
}
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
ADataType
,
bhalf_t
>
&&
is_same_v
<
BDataType
,
bhalf_t
>
&&
is_same_v
<
CDataType
,
bhalf_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances
(
op_ptrs
);
add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances
(
op_ptrs
);
}
}
#endif
#if(defined(CK_ENABLE_FP8))
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
f8_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024
-2025
, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...
...
@@ -75,6 +75,28 @@ using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
ELayout
,
ConvolutionBackwardWeightSpecialization
ConvSpec
,
BlockGemmPipelineScheduler
Scheduler
,
BlockGemmPipelineVersion
PipelineVersion
>
using
device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_irregular_instances
=
std
::
tuple
<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
//#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| |
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | |
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
48
,
64
,
32
,
8
,
16
,
16
,
3
,
4
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
3
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
48
,
32
,
8
,
16
,
16
,
4
,
3
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
3
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
80
,
32
,
8
,
16
,
16
,
4
,
5
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
5
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
112
,
32
,
8
,
16
,
16
,
4
,
7
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
7
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
208
,
32
,
8
,
16
,
16
,
4
,
13
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
13
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
...
...
@@ -118,6 +140,28 @@ using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instance
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
ELayout
,
ConvolutionBackwardWeightSpecialization
ConvSpec
,
BlockGemmPipelineScheduler
Scheduler
,
BlockGemmPipelineVersion
PipelineVersion
>
using
device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_irregular_instances
=
std
::
tuple
<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| Pipeline| Pipeline| ToMerge|
//#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| Scheduler| Version| |
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | | |
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
BF16
,
BF16
,
BF16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
48
,
64
,
32
,
8
,
16
,
16
,
3
,
4
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
3
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
BF16
,
BF16
,
BF16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
48
,
32
,
8
,
16
,
16
,
4
,
3
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
3
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
BF16
,
BF16
,
BF16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
80
,
32
,
8
,
16
,
16
,
4
,
5
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
5
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
BF16
,
BF16
,
BF16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
112
,
32
,
8
,
16
,
16
,
4
,
7
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
7
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
,
DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
BF16
,
BF16
,
BF16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
208
,
32
,
8
,
16
,
16
,
4
,
13
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
4
,
4
,
false
,
S
<
4
,
16
,
1
>
,
S
<
2
,
0
,
1
>
,
S
<
2
,
0
,
1
>
,
1
,
13
,
4
,
false
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
Scheduler
,
PipelineVersion
,
1
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -358,6 +358,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
op_ptrs
);
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
...
...
@@ -383,6 +387,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
op_ptrs
);
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instances
(
op_ptrs
);
}
#endif
}
...
...
@@ -478,6 +486,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
...
...
@@ -503,6 +515,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instances
(
op_ptrs
);
}
#endif
#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
View file @
96a0d5f6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
5
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -149,6 +149,30 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_p
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
GKYXC
,
NHWGK
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
GKYXC
,
NHWGK
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NGCHW
,
...
...
@@ -234,6 +258,30 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
GKYXC
,
NHWGK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
GKYXC
,
NHWGK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NGCHW
,
...
...
@@ -384,6 +432,30 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf1
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
BF16
,
BF16
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NGCDHW
,
...
...
@@ -469,6 +541,30 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NGCDHW
,
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
View file @
96a0d5f6
...
...
@@ -304,7 +304,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
AComputeType
,
ck
::
bhalf_t
>
&&
is_same_v
<
BComputeType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
AComputeType
,
int8_t
>
&&
...
...
Prev
1
…
8
9
10
11
12
13
14
15
16
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment