Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
cf0845ac
Unverified
Commit
cf0845ac
authored
May 23, 2023
by
Rostyslav Geyyer
Committed by
GitHub
May 23, 2023
Browse files
Merge branch 'develop' into lwpck-726
parents
8107bbb5
d821d1e5
Changes
64
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
175 additions
and
683 deletions
+175
-683
example/18_batched_gemm_reduce/CMakeLists.txt
example/18_batched_gemm_reduce/CMakeLists.txt
+3
-1
example/20_grouped_conv_bwd_weight/CMakeLists.txt
example/20_grouped_conv_bwd_weight/CMakeLists.txt
+6
-5
example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
...d_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+3
-1
example/21_gemm_layernorm/CMakeLists.txt
example/21_gemm_layernorm/CMakeLists.txt
+6
-4
example/26_contraction/contraction_bilinear_xdl_fp32.cpp
example/26_contraction/contraction_bilinear_xdl_fp32.cpp
+17
-151
example/26_contraction/contraction_bilinear_xdl_fp64.cpp
example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+17
-151
example/26_contraction/contraction_scale_xdl_fp32.cpp
example/26_contraction/contraction_scale_xdl_fp32.cpp
+18
-151
example/26_contraction/contraction_scale_xdl_fp64.cpp
example/26_contraction/contraction_scale_xdl_fp64.cpp
+18
-151
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+17
-19
example/31_batched_gemm_gemm/CMakeLists.txt
example/31_batched_gemm_gemm/CMakeLists.txt
+11
-9
example/35_splitK_gemm/CMakeLists.txt
example/35_splitK_gemm/CMakeLists.txt
+11
-10
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
+7
-6
example/40_conv2d_fwd_quantization/CMakeLists.txt
example/40_conv2d_fwd_quantization/CMakeLists.txt
+7
-5
example/41_grouped_conv_conv_fwd/CMakeLists.txt
example/41_grouped_conv_conv_fwd/CMakeLists.txt
+10
-8
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+3
-1
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
.../device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+5
-3
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
.../gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+5
-2
include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
...gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+3
-1
include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
...de/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+3
-1
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
...r_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+5
-3
No files found.
example/18_batched_gemm_reduce/CMakeLists.txt
View file @
cf0845ac
add_example_executable
(
example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp
)
endif
()
example/20_grouped_conv_bwd_weight/CMakeLists.txt
View file @
cf0845ac
add_custom_target
(
example_grouped_conv_bwd_weight
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_custom_target
(
example_grouped_conv_bwd_weight
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp
)
add_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
add_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
example_grouped_conv_bwd_weight_xdl_bf16
)
example_grouped_conv_bwd_weight_xdl_bf16
)
endif
()
add_custom_target
(
example_grouped_conv_bwd_weight_dl
)
add_custom_target
(
example_grouped_conv_bwd_weight_dl
)
...
...
example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
View file @
cf0845ac
...
@@ -18,7 +18,9 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
...
@@ -18,7 +18,9 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
// Set split_k = 2 for xdl op, split_k = 1 for dl
// Set split_k = 2 for xdl op, split_k = 1 for dl
// Dl op doesn't support split_k > 1
// Dl op doesn't support split_k > 1
// TODO: Add Dl op split_k > 1 support
// TODO: Add Dl op split_k > 1 support
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
))
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
))
{
{
split_k
=
2
;
split_k
=
2
;
}
}
...
...
example/21_gemm_layernorm/CMakeLists.txt
View file @
cf0845ac
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
)
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
)
add_example_executable
(
example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp
)
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
)
add_example_executable
(
example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
)
add_example_executable
(
example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp
)
add_example_executable
(
example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
)
endif
()
example/26_contraction/contraction_bilinear_xdl_fp32.cpp
View file @
cf0845ac
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
...
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
using
DeviceOpInstance
=
DeviceOpInstanceKKNN
;
using
DeviceOpInstance
=
DeviceOpInstanceKKNN
;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template
<
ck
::
index_t
NumDimM
,
ck
::
index_t
NumDimN
,
ck
::
index_t
NumDimK
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
ck
::
enable_if_t
<
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
,
bool
>
=
false
>
struct
ReferenceContraction_M2_N2_K2
:
public
ck
::
tensor_operation
::
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
ck
::
tensor_operation
::
device
::
BaseArgument
{
Argument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
:
a_ms_ks_
{
a_ms_ks
},
b_ns_ks_
{
b_ns_ks
},
e_ms_ns_
{
e_ms_ns
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
}
{
}
const
Tensor
<
ADataType
>&
a_ms_ks_
;
const
Tensor
<
BDataType
>&
b_ns_ks_
;
Tensor
<
EDataType
>&
e_ms_ns_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CDEElementwiseOperation
cde_element_op_
;
};
// Invoker
struct
Invoker
:
public
ck
::
tensor_operation
::
device
::
BaseInvoker
{
using
Argument
=
ReferenceContraction_M2_N2_K2
::
Argument
;
float
Run
(
const
Argument
&
arg
)
{
auto
f_ms_ns
=
[
&
](
auto
m0
,
auto
m1
,
auto
n0
,
auto
n1
)
{
const
int
K0
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
2
];
const
int
K1
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
3
];
AccDataType
v_acc
=
0
;
for
(
int
k0
=
0
;
k0
<
K0
;
++
k0
)
{
for
(
int
k1
=
0
;
k1
<
K1
;
++
k1
)
{
AccDataType
v_a
;
AccDataType
v_b
;
arg
.
a_element_op_
(
v_a
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
a_ms_ks_
(
m0
,
m1
,
k0
,
k1
)));
arg
.
b_element_op_
(
v_b
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
b_ns_ks_
(
n0
,
n1
,
k0
,
k1
)));
v_acc
+=
v_a
*
v_b
;
}
}
AccDataType
v_c
;
arg
.
cde_element_op_
(
v_c
,
v_acc
);
arg
.
e_ms_ns_
(
m0
,
m1
,
n0
,
n1
)
=
v_c
;
};
make_ParallelTensorFunctor
(
f_ms_ns
,
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
0
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
1
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
2
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
float
Run
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
/* stream_config */
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
bool
IsSupportedArgument
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
)
override
{
return
true
;
}
static
auto
MakeArgument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
{
return
Argument
{
a_ms_ks
,
b_ns_ks
,
e_ms_ns
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
ck
::
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"ReferenceContraction_M2_N2_K2"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
...
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
...
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
{
{
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
using
ReferenceOpInstance
=
ReferenceContraction_M2_N2_K2
<
NumDimM
,
using
ReferenceOpInstance
=
NumDim
N
,
ck
::
tensor_operation
::
host
::
ReferenceContraction_M2_N2_K2
<
NumDim
M
,
NumDim
K
,
NumDim
N
,
ADataType
,
NumDimK
,
B
DataType
,
A
DataType
,
CShuffle
DataType
,
B
DataType
,
Acc
DataType
,
CShuffle
DataType
,
AElementOp
,
AccDataType
,
B
ElementOp
,
A
ElementOp
,
PassThrough
>
;
BElementOp
>
;
auto
ref_
gemm
=
ReferenceOpInstance
{};
auto
ref_
op
=
ReferenceOpInstance
{};
auto
ref_invoker
=
ref_
gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_
op
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
auto
ref_argument
=
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
,
PassThrough
{}
);
ref_op
.
MakeArgument
(
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
);
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
...
...
example/26_contraction/contraction_bilinear_xdl_fp64.cpp
View file @
cf0845ac
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
...
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
using
DeviceOpInstance
=
DeviceOpInstanceKKNN
;
using
DeviceOpInstance
=
DeviceOpInstanceKKNN
;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template
<
ck
::
index_t
NumDimM
,
ck
::
index_t
NumDimN
,
ck
::
index_t
NumDimK
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
ck
::
enable_if_t
<
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
,
bool
>
=
false
>
struct
ReferenceContraction_M2_N2_K2
:
public
ck
::
tensor_operation
::
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
ck
::
tensor_operation
::
device
::
BaseArgument
{
Argument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
:
a_ms_ks_
{
a_ms_ks
},
b_ns_ks_
{
b_ns_ks
},
e_ms_ns_
{
e_ms_ns
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
}
{
}
const
Tensor
<
ADataType
>&
a_ms_ks_
;
const
Tensor
<
BDataType
>&
b_ns_ks_
;
Tensor
<
EDataType
>&
e_ms_ns_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CDEElementwiseOperation
cde_element_op_
;
};
// Invoker
struct
Invoker
:
public
ck
::
tensor_operation
::
device
::
BaseInvoker
{
using
Argument
=
ReferenceContraction_M2_N2_K2
::
Argument
;
float
Run
(
const
Argument
&
arg
)
{
auto
f_ms_ns
=
[
&
](
auto
m0
,
auto
m1
,
auto
n0
,
auto
n1
)
{
const
int
K0
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
2
];
const
int
K1
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
3
];
AccDataType
v_acc
=
0
;
for
(
int
k0
=
0
;
k0
<
K0
;
++
k0
)
{
for
(
int
k1
=
0
;
k1
<
K1
;
++
k1
)
{
AccDataType
v_a
;
AccDataType
v_b
;
arg
.
a_element_op_
(
v_a
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
a_ms_ks_
(
m0
,
m1
,
k0
,
k1
)));
arg
.
b_element_op_
(
v_b
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
b_ns_ks_
(
n0
,
n1
,
k0
,
k1
)));
v_acc
+=
v_a
*
v_b
;
}
}
AccDataType
v_c
;
arg
.
cde_element_op_
(
v_c
,
v_acc
);
arg
.
e_ms_ns_
(
m0
,
m1
,
n0
,
n1
)
=
v_c
;
};
make_ParallelTensorFunctor
(
f_ms_ns
,
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
0
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
1
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
2
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
float
Run
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
/* stream_config */
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
bool
IsSupportedArgument
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
)
override
{
return
true
;
}
static
auto
MakeArgument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
{
return
Argument
{
a_ms_ks
,
b_ns_ks
,
e_ms_ns
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
ck
::
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"ReferenceContraction_M2_N2_K2"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
...
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
...
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
{
{
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
using
ReferenceOpInstance
=
ReferenceContraction_M2_N2_K2
<
NumDimM
,
using
ReferenceOpInstance
=
NumDim
N
,
ck
::
tensor_operation
::
host
::
ReferenceContraction_M2_N2_K2
<
NumDim
M
,
NumDim
K
,
NumDim
N
,
ADataType
,
NumDimK
,
B
DataType
,
A
DataType
,
CShuffle
DataType
,
B
DataType
,
Acc
DataType
,
CShuffle
DataType
,
AElementOp
,
AccDataType
,
B
ElementOp
,
A
ElementOp
,
PassThrough
>
;
BElementOp
>
;
auto
ref_
gemm
=
ReferenceOpInstance
{};
auto
ref_
op
=
ReferenceOpInstance
{};
auto
ref_invoker
=
ref_
gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_
op
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
auto
ref_argument
=
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
,
PassThrough
{}
);
ref_op
.
MakeArgument
(
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
);
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
...
...
example/26_contraction/contraction_scale_xdl_fp32.cpp
View file @
cf0845ac
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
...
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
using
DeviceOpInstance
=
DeviceOpInstanceKKN
;
using
DeviceOpInstance
=
DeviceOpInstanceKKN
;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template
<
ck
::
index_t
NumDimM
,
ck
::
index_t
NumDimN
,
ck
::
index_t
NumDimK
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
ck
::
enable_if_t
<
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
,
bool
>
=
false
>
struct
ReferenceContraction_M2_N2_K2
:
public
ck
::
tensor_operation
::
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
ck
::
tensor_operation
::
device
::
BaseArgument
{
Argument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
:
a_ms_ks_
{
a_ms_ks
},
b_ns_ks_
{
b_ns_ks
},
e_ms_ns_
{
e_ms_ns
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
}
{
}
const
Tensor
<
ADataType
>&
a_ms_ks_
;
const
Tensor
<
BDataType
>&
b_ns_ks_
;
Tensor
<
EDataType
>&
e_ms_ns_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CDEElementwiseOperation
cde_element_op_
;
};
// Invoker
struct
Invoker
:
public
ck
::
tensor_operation
::
device
::
BaseInvoker
{
using
Argument
=
ReferenceContraction_M2_N2_K2
::
Argument
;
float
Run
(
const
Argument
&
arg
)
{
auto
f_ms_ns
=
[
&
](
auto
m0
,
auto
m1
,
auto
n0
,
auto
n1
)
{
const
int
K0
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
2
];
const
int
K1
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
3
];
AccDataType
v_acc
=
0
;
for
(
int
k0
=
0
;
k0
<
K0
;
++
k0
)
{
for
(
int
k1
=
0
;
k1
<
K1
;
++
k1
)
{
AccDataType
v_a
;
AccDataType
v_b
;
arg
.
a_element_op_
(
v_a
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
a_ms_ks_
(
m0
,
m1
,
k0
,
k1
)));
arg
.
b_element_op_
(
v_b
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
b_ns_ks_
(
n0
,
n1
,
k0
,
k1
)));
v_acc
+=
v_a
*
v_b
;
}
}
AccDataType
v_c
;
arg
.
cde_element_op_
(
v_c
,
v_acc
);
arg
.
e_ms_ns_
(
m0
,
m1
,
n0
,
n1
)
=
v_c
;
};
make_ParallelTensorFunctor
(
f_ms_ns
,
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
0
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
1
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
2
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
float
Run
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
/* stream_config */
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
bool
IsSupportedArgument
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
)
override
{
return
true
;
}
static
auto
MakeArgument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
{
return
Argument
{
a_ms_ks
,
b_ns_ks
,
e_ms_ns
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
ck
::
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"ReferenceContraction_M2_N2_K2"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
...
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
...
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
{
{
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
using
ReferenceOpInstance
=
ReferenceContraction_M2_N2_K2
<
NumDimM
,
using
ReferenceOpInstance
=
NumDimN
,
ck
::
tensor_operation
::
host
::
ReferenceContraction_M2_N2_K2
<
NumDimM
,
NumDimK
,
NumDimN
,
ADataType
,
NumDimK
,
BDataType
,
ADataType
,
CShuffleDataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
AccDataType
,
BElementOp
,
AElementOp
,
PassThrough
>
;
BElementOp
>
;
auto
ref_gemm
=
ReferenceOpInstance
{};
auto
ref_op
=
ReferenceOpInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_op
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
Tensor
<
float
>
empty_tensor
(
std
::
vector
<
ck
::
index_t
>
{},
std
::
vector
<
ck
::
index_t
>
{});
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
,
PassThrough
{});
auto
ref_argument
=
ref_op
.
MakeArgument
(
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
);
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
...
...
example/26_contraction/contraction_scale_xdl_fp64.cpp
View file @
cf0845ac
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
...
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
using
DeviceOpInstance
=
DeviceOpInstanceKKN
;
using
DeviceOpInstance
=
DeviceOpInstanceKKN
;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template
<
ck
::
index_t
NumDimM
,
ck
::
index_t
NumDimN
,
ck
::
index_t
NumDimK
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
ck
::
enable_if_t
<
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
,
bool
>
=
false
>
struct
ReferenceContraction_M2_N2_K2
:
public
ck
::
tensor_operation
::
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
ck
::
tensor_operation
::
device
::
BaseArgument
{
Argument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
:
a_ms_ks_
{
a_ms_ks
},
b_ns_ks_
{
b_ns_ks
},
e_ms_ns_
{
e_ms_ns
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
}
{
}
const
Tensor
<
ADataType
>&
a_ms_ks_
;
const
Tensor
<
BDataType
>&
b_ns_ks_
;
Tensor
<
EDataType
>&
e_ms_ns_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CDEElementwiseOperation
cde_element_op_
;
};
// Invoker
struct
Invoker
:
public
ck
::
tensor_operation
::
device
::
BaseInvoker
{
using
Argument
=
ReferenceContraction_M2_N2_K2
::
Argument
;
float
Run
(
const
Argument
&
arg
)
{
auto
f_ms_ns
=
[
&
](
auto
m0
,
auto
m1
,
auto
n0
,
auto
n1
)
{
const
int
K0
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
2
];
const
int
K1
=
arg
.
a_ms_ks_
.
mDesc
.
GetLengths
()[
3
];
AccDataType
v_acc
=
0
;
for
(
int
k0
=
0
;
k0
<
K0
;
++
k0
)
{
for
(
int
k1
=
0
;
k1
<
K1
;
++
k1
)
{
AccDataType
v_a
;
AccDataType
v_b
;
arg
.
a_element_op_
(
v_a
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
a_ms_ks_
(
m0
,
m1
,
k0
,
k1
)));
arg
.
b_element_op_
(
v_b
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
b_ns_ks_
(
n0
,
n1
,
k0
,
k1
)));
v_acc
+=
v_a
*
v_b
;
}
}
AccDataType
v_c
;
arg
.
cde_element_op_
(
v_c
,
v_acc
);
arg
.
e_ms_ns_
(
m0
,
m1
,
n0
,
n1
)
=
v_c
;
};
make_ParallelTensorFunctor
(
f_ms_ns
,
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
0
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
1
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
2
],
arg
.
e_ms_ns_
.
mDesc
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
float
Run
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
/* stream_config */
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
bool
IsSupportedArgument
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
)
override
{
return
true
;
}
static
auto
MakeArgument
(
const
Tensor
<
ADataType
>&
a_ms_ks
,
const
Tensor
<
BDataType
>&
b_ns_ks
,
Tensor
<
EDataType
>&
e_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
{
return
Argument
{
a_ms_ks
,
b_ns_ks
,
e_ms_ns
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
ck
::
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"ReferenceContraction_M2_N2_K2"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
...
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
...
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
{
{
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_ms_ns_lengths
,
e_ms_ns_strides
);
using
ReferenceOpInstance
=
ReferenceContraction_M2_N2_K2
<
NumDimM
,
using
ReferenceOpInstance
=
NumDimN
,
ck
::
tensor_operation
::
host
::
ReferenceContraction_M2_N2_K2
<
NumDimM
,
NumDimK
,
NumDimN
,
ADataType
,
NumDimK
,
BDataType
,
ADataType
,
CShuffleDataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
AccDataType
,
BElementOp
,
AElementOp
,
PassThrough
>
;
BElementOp
>
;
auto
ref_gemm
=
ReferenceOpInstance
{};
auto
ref_op
=
ReferenceOpInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_op
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
Tensor
<
float
>
empty_tensor
(
std
::
vector
<
ck
::
index_t
>
{},
std
::
vector
<
ck
::
index_t
>
{});
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
,
PassThrough
{});
auto
ref_argument
=
ref_op
.
MakeArgument
(
a_ms_ks
,
b_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
);
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
...
...
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
View file @
cf0845ac
add_custom_target
(
example_grouped_conv_fwd_multiple_d
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_custom_target
(
example_grouped_conv_fwd_multiple_d
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
endif
()
# USE_BITINT_EXTENSION_INT4
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
endif
()
# USE_BITINT_EXTENSION_INT4
add_example_executable
(
example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16
)
endif
()
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
endif
()
endif
()
add_example_executable
(
example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16
)
example/31_batched_gemm_gemm/CMakeLists.txt
View file @
cf0845ac
add_example_executable
(
example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp
)
if
(
NOT GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp
)
if
(
NOT GPU_TARGETS MATCHES
"gfx940"
)
endif
()
add_example_executable
(
example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp
)
endif
()
if
(
USE_BITINT_EXTENSION_INT4
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp
)
endif
(
USE_BITINT_EXTENSION_INT4
)
endif
(
USE_BITINT_EXTENSION_INT4
)
endif
()
\ No newline at end of file
example/35_splitK_gemm/CMakeLists.txt
View file @
cf0845ac
add_custom_target
(
example_splitK_gemm_xdl
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_custom_target
(
example_splitK_gemm_xdl
)
add_example_executable
(
example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp
)
add_dependencies
(
example_splitK_gemm_xdl
add_example_executable
(
example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp
)
add_dependencies
(
example_splitK_gemm_xdl
example_splitK_gemm_xdl_fp32
example_splitK_gemm_xdl_fp32
example_splitK_gemm_xdl_fp16
example_splitK_gemm_xdl_fp16
example_splitK_gemm_xdl_bfp16
example_splitK_gemm_xdl_bfp16
example_splitK_gemm_xdl_int8
)
example_splitK_gemm_xdl_int8
)
if
(
USE_BITINT_EXTENSION_INT4
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp
)
add_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_int4
)
add_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_int4
)
endif
()
endif
()
endif
()
example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt
View file @
cf0845ac
add_custom_target
(
example_grouped_conv_bwd_data
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_custom_target
(
example_grouped_conv_bwd_data
)
add_example_executable
(
example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp
)
add_example_executable
(
example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp
)
add_example_executable
(
example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp
)
add_dependencies
(
example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16
)
add_example_executable
(
example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp
)
add_dependencies
(
example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16
)
endif
()
add_dependencies
(
example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16
)
\ No newline at end of file
add_dependencies
(
example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16
)
example/40_conv2d_fwd_quantization/CMakeLists.txt
View file @
cf0845ac
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
)
endif
()
# Conv perlayer quantization
# Conv perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp
)
# Conv perchannel quantization
# Conv perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp
)
# Conv + bias + relu perlayer quantization
# Conv + bias + relu perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
)
# Conv + bias + relu perchannel quantization
# Conv + bias + relu perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
)
# Conv + bias + tanh perlayer quantization
# Conv + bias + tanh perlayer quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp
)
# Conv + bias + tanh perchannel quantization
# Conv + bias + tanh perchannel quantization
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
)
add_example_executable
(
example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp
)
\ No newline at end of file
example/41_grouped_conv_conv_fwd/CMakeLists.txt
View file @
cf0845ac
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp
)
if
(
NOT GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp
)
if
(
NOT GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp
)
endif
()
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp
)
endif
(
USE_BITINT_EXTENSION_INT4
)
endif
()
endif
()
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp
)
endif
(
USE_BITINT_EXTENSION_INT4
)
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
View file @
cf0845ac
add_example_executable
(
example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
add_example_executable
(
example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp
)
endif
()
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
View file @
cf0845ac
...
@@ -134,8 +134,9 @@ __global__ void
...
@@ -134,8 +134,9 @@ __global__ void
const
Block2CTileMap
block_2_ctile_map
,
const
Block2CTileMap
block_2_ctile_map
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__))
defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__))
// offset base pointer for each work-group
// offset base pointer for each work-group
const
index_t
num_blocks_per_batch
=
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
...
@@ -711,7 +712,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
...
@@ -711,7 +712,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
// check device
// check device
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx90a"
||
ck
::
get_device_name
()
==
"gfx908"
||
ck
::
get_device_name
()
==
"gfx90a"
||
ck
::
get_device_name
()
==
"gfx908"
||
ck
::
get_device_name
()
==
"gfx940"
))
ck
::
get_device_name
()
==
"gfx940"
||
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
))
{
{
return
false
;
return
false
;
}
}
...
...
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
View file @
cf0845ac
...
@@ -106,7 +106,8 @@ __global__ void
...
@@ -106,7 +106,8 @@ __global__ void
const
Block2CTileMap
block_2_ctile_map
,
const
Block2CTileMap
block_2_ctile_map
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__))
// offset base pointer for each work-group
// offset base pointer for each work-group
const
index_t
num_blocks_per_batch
=
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
...
@@ -600,7 +601,9 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
...
@@ -600,7 +601,9 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd<NDimS
namespace
ctc
=
tensor_layout
::
convolution
;
namespace
ctc
=
tensor_layout
::
convolution
;
// check device
// check device
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
))
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
))
{
{
return
false
;
return
false
;
}
}
...
...
include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
View file @
cf0845ac
...
@@ -1393,7 +1393,9 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl
...
@@ -1393,7 +1393,9 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
// check device
// check device
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
))
if
(
!
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
))
{
{
return
false
;
return
false
;
}
}
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
View file @
cf0845ac
...
@@ -485,7 +485,9 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
...
@@ -485,7 +485,9 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
)
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
{
{
return
GridwiseGemm
::
CheckValidity
(
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
);
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
);
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
View file @
cf0845ac
...
@@ -50,8 +50,9 @@ __global__ void
...
@@ -50,8 +50,9 @@ __global__ void
const
CGridDesc_M0_M10_M11_N0_N10_N11
e_grid_desc_m0_m10_m11_n0_n10_n11
,
const
CGridDesc_M0_M10_M11_N0_N10_N11
e_grid_desc_m0_m10_m11_n0_n10_n11
,
const
Block2CTileMap
block_2_ctile_map
)
const
Block2CTileMap
block_2_ctile_map
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__))
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1101__) || defined(__gfx1102__))
constexpr
index_t
shared_block_size
=
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
ABDataType
);
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
ABDataType
);
...
@@ -553,7 +554,8 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
...
@@ -553,7 +554,8 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
{
{
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx908"
||
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx908"
||
ck
::
get_device_name
()
==
"gfx90a"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx90a"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx940"
)
ck
::
get_device_name
()
==
"gfx940"
||
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
{
{
return
GridwiseGemm
::
CheckValidity
(
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
e_grid_desc_m_n_
);
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
e_grid_desc_m_n_
);
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment