Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9312548e
Unverified
Commit
9312548e
authored
Feb 15, 2023
by
zjing14
Committed by
GitHub
Feb 15, 2023
Browse files
Merge branch 'develop' into sphinx_doc
parents
09908c9f
19490ac4
Changes
63
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4091 additions
and
150 deletions
+4091
-150
example/26_contraction/contraction_scale_xdl_fp64.cpp
example/26_contraction/contraction_scale_xdl_fp64.cpp
+409
-0
example/27_layernorm/layernorm_blockwise.cpp
example/27_layernorm/layernorm_blockwise.cpp
+8
-8
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+4
-0
example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
..._bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+431
-0
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+3
-0
example/30_grouped_conv_fwd_multiple_d/common.hpp
example/30_grouped_conv_fwd_multiple_d/common.hpp
+1
-1
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
+355
-0
example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
...d_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
+26
-0
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
...ple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+286
-0
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+7
-7
example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
...le/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
+2
-3
include/ck/ck.hpp
include/ck/ck.hpp
+0
-7
include/ck/host_utility/kernel_launch.hpp
include/ck/host_utility/kernel_launch.hpp
+5
-4
include/ck/tensor_operation/gpu/device/device_normalization.hpp
...e/ck/tensor_operation/gpu/device/device_normalization.hpp
+7
-7
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
...l/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+991
-0
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
.../gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+654
-0
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
...impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+850
-0
include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
...r_operation/gpu/device/impl/device_normalization_impl.hpp
+39
-113
include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
...r_operation/gpu/element/binary_element_wise_operation.hpp
+7
-0
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
...or_operation/gpu/element/unary_element_wise_operation.hpp
+6
-0
No files found.
example/26_contraction/contraction_scale_xdl_fp64.cpp
0 → 100644
View file @
9312548e
This diff is collapsed.
Click to expand it.
example/27_layernorm/layernorm_blockwise.cpp
View file @
9312548e
...
@@ -20,12 +20,12 @@
...
@@ -20,12 +20,12 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
using
XDataType
=
ck
::
half_t
;
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
Acc
DataType
=
float
;
using
Compute
DataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
constexpr
int
Rank
=
2
;
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
constexpr
int
NumReduceDim
=
1
;
...
@@ -34,7 +34,7 @@ using DeviceInstance =
...
@@ -34,7 +34,7 @@ using DeviceInstance =
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
PassThrough
,
PassThrough
,
Rank
,
Rank
,
...
@@ -121,7 +121,7 @@ int main()
...
@@ -121,7 +121,7 @@ int main()
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
YDataType
,
YDataType
,
Acc
DataType
,
Compute
DataType
,
PassThrough
,
PassThrough
,
Rank
,
Rank
,
NumReduceDim
>
;
NumReduceDim
>
;
...
...
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
View file @
9312548e
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_example_executable
(
example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp
)
endif
()
example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
0 → 100644
View file @
9312548e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Add
=
ck
::
tensor_operation
::
element_wise
::
Add
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F16
;
using
DDataType
=
F16
;
using
DsDataType
=
ck
::
Tuple
<
DDataType
>
;
using
EDataType
=
F16
;
static
constexpr
ck
::
index_t
NumDimG
=
2
;
static
constexpr
ck
::
index_t
NumDimM
=
2
;
static
constexpr
ck
::
index_t
NumDimN
=
2
;
static
constexpr
ck
::
index_t
NumDimK
=
1
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CDEElementOp
=
ck
::
tensor_operation
::
element_wise
::
Add
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
ABSpec
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Packed
;
static
constexpr
auto
DESpec
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Default
;
using
DeviceOpInstanceKKNN
=
ck
::
tensor_operation
::
device
::
DeviceBatchedContractionMultipleD_Wmma_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
DsDataType
,
EDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmSpec
,
ABSpec
,
ABSpec
,
DESpec
,
256
,
128
,
256
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
using
DeviceOpInstance
=
DeviceOpInstanceKKNN
;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template
<
ck
::
index_t
NumDimG
,
ck
::
index_t
NumDimM
,
ck
::
index_t
NumDimN
,
ck
::
index_t
NumDimK
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
ck
::
enable_if_t
<
NumDimG
==
2
&&
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
1
,
bool
>
=
false
>
struct
ReferenceContraction_G2_M2_N2_K1
:
public
ck
::
tensor_operation
::
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
ck
::
tensor_operation
::
device
::
BaseArgument
{
Argument
(
const
Tensor
<
ADataType
>&
a_gs_ms_ks
,
const
Tensor
<
BDataType
>&
b_gs_ns_ks
,
Tensor
<
EDataType
>&
e_gs_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
:
a_gs_ms_ks_
{
a_gs_ms_ks
},
b_gs_ns_ks_
{
b_gs_ns_ks
},
e_gs_ms_ns_
{
e_gs_ms_ns
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
}
{
}
const
Tensor
<
ADataType
>&
a_gs_ms_ks_
;
const
Tensor
<
BDataType
>&
b_gs_ns_ks_
;
Tensor
<
EDataType
>&
e_gs_ms_ns_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CDEElementwiseOperation
cde_element_op_
;
};
// Invoker
struct
Invoker
:
public
ck
::
tensor_operation
::
device
::
BaseInvoker
{
using
Argument
=
ReferenceContraction_G2_M2_N2_K1
::
Argument
;
float
Run
(
const
Argument
&
arg
)
{
auto
f_ms_ns
=
[
&
](
auto
g0
,
auto
g1
,
auto
m0
,
auto
m1
,
auto
n0
,
auto
n1
)
{
const
int
K0
=
arg
.
a_gs_ms_ks_
.
mDesc
.
GetLengths
()[
4
];
AccDataType
v_acc
=
0
;
for
(
int
k0
=
0
;
k0
<
K0
;
++
k0
)
{
AccDataType
v_a
;
AccDataType
v_b
;
arg
.
a_element_op_
(
v_a
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
a_gs_ms_ks_
(
g0
,
g1
,
m0
,
m1
,
k0
)));
arg
.
b_element_op_
(
v_b
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
b_gs_ns_ks_
(
g0
,
g1
,
n0
,
n1
,
k0
)));
v_acc
+=
v_a
*
v_b
;
}
AccDataType
v_c
;
arg
.
cde_element_op_
(
v_c
,
v_acc
);
arg
.
e_gs_ms_ns_
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
)
=
v_c
;
};
make_ParallelTensorFunctor
(
f_ms_ns
,
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
0
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
1
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
2
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
3
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
4
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
5
])(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
float
Run
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
/* stream_config */
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
bool
IsSupportedArgument
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
)
override
{
return
true
;
}
static
auto
MakeArgument
(
const
Tensor
<
ADataType
>&
a_gs_ms_ks
,
const
Tensor
<
BDataType
>&
b_gs_ns_ks
,
Tensor
<
EDataType
>&
e_gs_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
{
return
Argument
{
a_gs_ms_ks
,
b_gs_ns_ks
,
e_gs_ms_ns
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
ck
::
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"ReferenceContraction_G2_M2_N2_K1"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
true
;
ck
::
index_t
G0
=
1
;
ck
::
index_t
G1
=
2
;
ck
::
index_t
M0
=
4
;
ck
::
index_t
M1
=
128
;
ck
::
index_t
N0
=
16
;
ck
::
index_t
N1
=
256
;
ck
::
index_t
K0
=
2048
;
// A[G0, G1, M0, M1, K0]
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_lengths
{
G0
,
G1
,
M0
,
M1
,
K0
};
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_strides
{
G1
*
M0
*
M1
*
K0
,
M0
*
M1
*
K0
,
M1
*
K0
,
K0
,
1
};
// B[G0, G1, N0, N1, K0]
std
::
vector
<
ck
::
index_t
>
b_gs_ns_ks_lengths
{
G0
,
G1
,
N0
,
N1
,
K0
};
std
::
vector
<
ck
::
index_t
>
b_gs_ns_ks_strides
{
G1
*
N0
*
N1
*
K0
,
N0
*
N1
*
K0
,
N1
*
K0
,
K0
,
1
};
// D[G0, G1, M0, N0, M1, N1]
std
::
vector
<
ck
::
index_t
>
d_gs_ms_ns_lengths
{
G0
,
G1
,
M0
,
M1
,
N0
,
N1
};
std
::
vector
<
ck
::
index_t
>
d_gs_ms_ns_strides
{
G1
*
N0
*
N1
,
N0
*
N1
,
0
,
0
,
N1
,
1
};
// E[G0, G1, M0, N0, M1, N1]
std
::
vector
<
ck
::
index_t
>
e_gs_ms_ns_lengths
{
G0
,
G1
,
M0
,
M1
,
N0
,
N1
};
std
::
vector
<
ck
::
index_t
>
e_gs_ms_ns_strides
{
G1
*
M0
*
N0
*
M1
*
N1
,
M0
*
N0
*
M1
*
N1
,
N0
*
M1
*
N1
,
N1
,
M1
*
N1
,
1
};
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3: time kernel (0=no, 1=yes)
\n
"
);
exit
(
0
);
}
Tensor
<
ADataType
>
a_gs_ms_ks
(
a_gs_ms_ks_lengths
,
a_gs_ms_ks_strides
);
Tensor
<
BDataType
>
b_gs_ns_ks
(
b_gs_ns_ks_lengths
,
b_gs_ns_ks_strides
);
Tensor
<
DDataType
>
d_gs_ms_ns
(
d_gs_ms_ns_lengths
,
d_gs_ms_ns_strides
);
Tensor
<
EDataType
>
e_gs_ms_ns_host_result
(
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
);
Tensor
<
EDataType
>
e_gs_ms_ns_device_result
(
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
);
std
::
cout
<<
"a_gs_ms_ks: "
<<
a_gs_ms_ks
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_gs_ns_ks: "
<<
b_gs_ns_ks
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d_gs_ms_ns: "
<<
d_gs_ms_ns
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"e_gs_ms_ns: "
<<
e_gs_ms_ns_host_result
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
a_gs_ms_ks
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
});
b_gs_ns_ks
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
d_gs_ms_ns
.
GenerateTensorValue
(
GeneratorTensor_2
<
DDataType
>
{
-
5
,
5
});
break
;
default:
a_gs_ms_ks
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
b_gs_ns_ks
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
d_gs_ms_ns
.
GenerateTensorValue
(
GeneratorTensor_3
<
DDataType
>
{
-
0.5
,
0.5
});
break
;
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_gs_ms_ks
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_gs_ns_ks
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
d_gs_ms_ns
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_gs_ms_ns_device_result
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a_gs_ms_ks
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_gs_ns_ks
.
mData
.
data
());
d_device_buf
.
ToDevice
(
d_gs_ms_ns
.
mData
.
data
());
// set zero
e_device_buf
.
SetZero
();
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
cde_element_op
=
CDEElementOp
{};
// device operation
auto
op
=
DeviceOpInstance
{};
auto
invoker
=
op
.
MakeInvoker
();
auto
argument
=
op
.
MakeArgument
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
a_gs_ms_ks_lengths
,
a_gs_ms_ks_strides
,
b_gs_ns_ks_lengths
,
b_gs_ns_ks_strides
,
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_gs_ms_ns_lengths
},
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_gs_ms_ns_strides
},
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
,
a_element_op
,
b_element_op
,
cde_element_op
);
if
(
!
op
.
IsSupportedArgument
(
argument
))
{
std
::
cout
<<
op
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
0
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
ck
::
index_t
G
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_gs_ms_ns_lengths
.
begin
(),
NumDimG
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
M
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_gs_ms_ns_lengths
.
begin
()
+
NumDimG
,
NumDimM
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
N
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_gs_ms_ns_lengths
.
begin
()
+
NumDimG
+
NumDimM
,
NumDimN
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
K
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
a_gs_ms_ks_lengths
.
begin
()
+
NumDimG
+
NumDimM
,
NumDimK
,
1
,
std
::
multiplies
<>
{});
std
::
cout
<<
"GMNK="
<<
G
<<
", "
<<
M
<<
", "
<<
N
<<
", "
<<
K
<<
std
::
endl
;
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
G
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
G
*
M
*
K
+
sizeof
(
BDataType
)
*
G
*
K
*
N
+
sizeof
(
DDataType
)
*
G
*
M
*
N
+
sizeof
(
EDataType
)
*
G
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op
.
GetTypeString
()
<<
std
::
endl
;
e_device_buf
.
FromDevice
(
e_gs_ms_ns_device_result
.
mData
.
data
());
if
(
do_verification
)
{
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
);
using
ReferenceOpInstance
=
ReferenceContraction_G2_M2_N2_K1
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
CShuffleDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
PassThrough
>
;
auto
ref_gemm
=
ReferenceOpInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_gs_ms_ks
,
b_gs_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
for
(
size_t
g0
=
0
;
g0
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
0
];
++
g0
)
{
for
(
size_t
g1
=
0
;
g1
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
1
];
++
g1
)
{
for
(
size_t
m0
=
0
;
m0
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
2
];
++
m0
)
{
for
(
size_t
m1
=
0
;
m1
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
3
];
++
m1
)
{
for
(
size_t
n0
=
0
;
n0
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
4
];
++
n0
)
{
for
(
size_t
n1
=
0
;
n1
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
5
];
++
n1
)
{
cde_element_op
(
e_gs_ms_ns_host_result
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
),
c_ms_ns_host_result
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
),
d_gs_ms_ns
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
));
}
}
}
}
}
}
return
ck
::
utils
::
check_err
(
e_gs_ms_ns_device_result
,
e_gs_ms_ns_host_result
)
?
0
:
1
;
}
return
0
;
}
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
View file @
9312548e
...
@@ -16,6 +16,9 @@ if(USE_BITINT_EXTENSION_INT4)
...
@@ -16,6 +16,9 @@ if(USE_BITINT_EXTENSION_INT4)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
endif
()
# USE_BITINT_EXTENSION_INT4
endif
()
# USE_BITINT_EXTENSION_INT4
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
endif
()
add_example_executable
(
example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp
)
...
...
example/30_grouped_conv_fwd_multiple_d/common.hpp
View file @
9312548e
...
@@ -137,7 +137,7 @@ inline bool parse_cmd_args(int argc,
...
@@ -137,7 +137,7 @@ inline bool parse_cmd_args(int argc,
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
threshold_to_catch_partial_args
,
argv
);
num_dim_spatial
,
threshold_to_catch_partial_args
+
1
,
argv
);
}
}
else
else
{
{
...
...
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
0 → 100644
View file @
9312548e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <array>
#include <iostream>
#include <string>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using
BF16
=
ck
::
bhalf_t
;
using
FP16
=
ck
::
half_t
;
using
FP32
=
float
;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using
I4
=
ck
::
int4_t
;
#endif
using
I8
=
std
::
int8_t
;
using
I32
=
std
::
int32_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
typename
InputLay
,
typename
WeightLay
,
typename
OutputLay
>
struct
CommonLayoutSetting
{
using
InputLayout
=
InputLay
;
using
WeightLayout
=
WeightLay
;
using
OutputLayout
=
OutputLay
;
};
template
<
ck
::
index_t
NDimSpatial
>
struct
CommonLayoutSettingSelector
;
namespace
ctl
=
ck
::
tensor_layout
::
convolution
;
template
<
>
struct
CommonLayoutSettingSelector
<
1
>
final
:
CommonLayoutSetting
<
ctl
::
G_NW_C
,
ctl
::
G_K_X_C
,
ctl
::
G_NW_K
>
{
};
template
<
>
struct
CommonLayoutSettingSelector
<
2
>
final
:
CommonLayoutSetting
<
ctl
::
G_NHW_C
,
ctl
::
G_K_YX_C
,
ctl
::
G_NHW_K
>
{
};
template
<
>
struct
CommonLayoutSettingSelector
<
3
>
final
:
CommonLayoutSetting
<
ctl
::
G_NDHW_C
,
ctl
::
G_K_ZYX_C
,
ctl
::
G_NDHW_K
>
{
};
template
<
ck
::
index_t
NDimSpatial
>
using
InputLayout
=
typename
CommonLayoutSettingSelector
<
NDimSpatial
>::
InputLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
WeightLayout
=
typename
CommonLayoutSettingSelector
<
NDimSpatial
>::
WeightLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
OutputLayout
=
typename
CommonLayoutSettingSelector
<
NDimSpatial
>::
OutputLayout
;
struct
ExecutionConfig
final
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
true
;
};
#define DefaultConvParam \
ck::utils::conv::ConvParam \
{ \
2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
}
inline
void
print_help_msg
()
{
std
::
cerr
<<
"arg1: verification (0=no, 1=yes)
\n
"
<<
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
<<
"arg3: time kernel (0=no, 1=yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
}
inline
bool
parse_cmd_args
(
int
argc
,
char
*
argv
[],
ExecutionConfig
&
config
,
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
constexpr
int
num_execution_config_args
=
3
;
// arguments for do_verification, init_method, time_kernel
constexpr
int
num_conv_param_leading_args
=
5
;
// arguments for num_dim_spatial_, G_, N_, K_, C_
constexpr
int
threshold_to_catch_partial_args
=
1
+
num_execution_config_args
;
constexpr
int
threshold_to_catch_all_args
=
threshold_to_catch_partial_args
+
num_conv_param_leading_args
;
if
(
argc
==
1
)
{
// use default
}
// catch only ExecutionConfig arguments
else
if
(
argc
==
threshold_to_catch_partial_args
)
{
config
.
do_verification
=
std
::
stoi
(
argv
[
1
]);
config
.
init_method
=
std
::
stoi
(
argv
[
2
]);
config
.
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
// catch both ExecutionConfig & ConvParam arguments
else
if
(
threshold_to_catch_all_args
<
argc
&&
((
argc
-
threshold_to_catch_all_args
)
%
3
==
0
))
{
config
.
do_verification
=
std
::
stoi
(
argv
[
1
]);
config
.
init_method
=
std
::
stoi
(
argv
[
2
]);
config
.
time_kernel
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
threshold_to_catch_partial_args
+
1
,
argv
);
}
else
{
print_help_msg
();
return
false
;
}
return
true
;
}
inline
HostTensorDescriptor
make_input_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
C_
,
conv_param
.
input_spatial_lengths_
[
0
]},
{
conv_param
.
C_
,
// g
conv_param
.
input_spatial_lengths_
[
0
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// n
1
,
// c
conv_param
.
G_
*
conv_param
.
C_
// wi
});
case
2
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
C_
,
conv_param
.
input_spatial_lengths_
[
0
],
conv_param
.
input_spatial_lengths_
[
1
]},
{
conv_param
.
C_
,
// g
conv_param
.
input_spatial_lengths_
[
0
]
*
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// n
1
,
// c
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// hi
conv_param
.
G_
*
conv_param
.
C_
// wi
});
case
3
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
C_
,
conv_param
.
input_spatial_lengths_
[
0
],
conv_param
.
input_spatial_lengths_
[
1
],
conv_param
.
input_spatial_lengths_
[
2
]},
{
conv_param
.
C_
,
// g
conv_param
.
input_spatial_lengths_
[
0
]
*
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
input_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// n
1
,
// c
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
input_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// di
conv_param
.
input_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// hi
conv_param
.
G_
*
conv_param
.
C_
// wi
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
inline
HostTensorDescriptor
make_weight_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
K_
,
conv_param
.
C_
,
conv_param
.
filter_spatial_lengths_
[
0
]},
{
conv_param
.
K_
*
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
C_
,
// g
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
C_
,
// k
1
,
// c
conv_param
.
C_
// x
});
case
2
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
K_
,
conv_param
.
C_
,
conv_param
.
filter_spatial_lengths_
[
0
],
conv_param
.
filter_spatial_lengths_
[
1
]},
{
conv_param
.
K_
*
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
C_
,
// g
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
C_
,
// k
1
,
// c
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
C_
,
// y
conv_param
.
C_
// x
});
case
3
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
K_
,
conv_param
.
C_
,
conv_param
.
filter_spatial_lengths_
[
0
],
conv_param
.
filter_spatial_lengths_
[
1
],
conv_param
.
filter_spatial_lengths_
[
2
]},
{
conv_param
.
K_
*
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// g
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// k
1
,
// c
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// z
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// y
conv_param
.
C_
// x
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
inline
HostTensorDescriptor
make_bias_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
]},
{
conv_param
.
K_
,
// g
0
,
// k
1
,
// c
0
// x
});
case
2
:
return
HostTensorDescriptor
({
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
]},
{
conv_param
.
K_
,
// g
0
,
// n
1
,
// k
0
,
// ho
0
// wo
});
case
3
:
return
HostTensorDescriptor
({
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
],
conv_param
.
output_spatial_lengths_
[
2
]},
{
conv_param
.
K_
,
// g
0
,
// n
1
,
// k
0
,
// z
0
,
// y
0
// x
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
inline
HostTensorDescriptor
make_output_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
]},
{
conv_param
.
K_
,
// g
conv_param
.
output_spatial_lengths_
[
0
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// n
1
,
// k
conv_param
.
G_
*
conv_param
.
K_
// wo
});
case
2
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
]},
{
conv_param
.
K_
,
// g
conv_param
.
output_spatial_lengths_
[
0
]
*
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// n
1
,
// k
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// ho
conv_param
.
G_
*
conv_param
.
K_
// wo
});
case
3
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
],
conv_param
.
output_spatial_lengths_
[
2
]},
{
conv_param
.
K_
,
// g
conv_param
.
output_spatial_lengths_
[
0
]
*
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
output_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// n
1
,
// k
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
output_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// do
conv_param
.
output_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// ho
conv_param
.
G_
*
conv_param
.
K_
// wo
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
0 → 100644
View file @
9312548e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_wmma.hpp"
// kernel data types
using
InKernelDataType
=
FP16
;
using
WeiKernelDataType
=
FP16
;
using
AccDataType
=
FP32
;
using
CShuffleDataType
=
FP16
;
using
BiasKernelDataType
=
FP16
;
using
ResidualKernelDataType
=
FP16
;
using
OutKernelDataType
=
FP16
;
// tensor data types
using
InUserDataType
=
InKernelDataType
;
using
WeiUserDataType
=
WeiKernelDataType
;
using
OutUserDataType
=
OutKernelDataType
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
AddReluAdd
;
#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_grouped_conv_fwd_bias_relu_add_example
(
argc
,
argv
);
}
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
0 → 100644
View file @
9312548e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
template
<
typename
BiasLay
,
typename
ResidualLay
>
struct
LayoutSetting
{
using
BiasLayout
=
BiasLay
;
using
ResidualLayout
=
ResidualLay
;
};
template
<
ck
::
index_t
NDimSpatial
>
struct
LayoutSettingSelector
;
template
<>
struct
LayoutSettingSelector
<
1
>
final
:
LayoutSetting
<
ctl
::
G_K
,
ctl
::
G_NW_K
>
{
};
template
<>
struct
LayoutSettingSelector
<
2
>
final
:
LayoutSetting
<
ctl
::
G_K
,
ctl
::
G_NHW_K
>
{
};
template
<>
struct
LayoutSettingSelector
<
3
>
final
:
LayoutSetting
<
ctl
::
G_K
,
ctl
::
G_NDHW_K
>
{
};
template
<
ck
::
index_t
NDimSpatial
>
using
BiasLayout
=
typename
LayoutSettingSelector
<
NDimSpatial
>::
BiasLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
ResidualLayout
=
typename
LayoutSettingSelector
<
NDimSpatial
>::
ResidualLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDimSpatial
,
InputLayout
<
NDimSpatial
>
,
WeightLayout
<
NDimSpatial
>
,
ck
::
Tuple
<
BiasLayout
<
NDimSpatial
>
,
ResidualLayout
<
NDimSpatial
>>
,
OutputLayout
<
NDimSpatial
>
,
InKernelDataType
,
WeiKernelDataType
,
ck
::
Tuple
<
BiasKernelDataType
,
ResidualKernelDataType
>
,
OutKernelDataType
,
AccDataType
,
CShuffleDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
256
,
// BlockSize
128
,
// MPerBlock
128
,
// NPerBlock
4
,
// K0PerBlock
8
,
// K1
16
,
// MPerWMMA
16
,
// NPerWMMA
4
,
// MRepeat
2
,
// NRepeat
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
8
,
// ABlockTransferSrcScalarPerVector
8
,
// ABlockTransferDstScalarPerVector_AK1
true
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
8
,
// BBlockTransferSrcScalarPerVector
8
,
// BBlockTransferDstScalarPerVector_BK1
true
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
template
<
ck
::
index_t
NDimSpatial
>
using
HostConvFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InUserDataType
,
WeiUserDataType
,
CShuffleDataType
,
InElementOp
,
WeiElementOp
,
PassThrough
>
;
template
<
ck
::
index_t
NDimSpatial
>
bool
run_grouped_conv_fwd_bias_relu_add
(
const
ExecutionConfig
&
config
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
static_assert
(
1
<=
NDimSpatial
&&
NDimSpatial
<=
3
,
"Unsupported NDimSpatial"
);
const
auto
in_g_n_c_wis_desc
=
make_input_descriptor
(
conv_param
);
const
auto
wei_g_k_c_xs_desc
=
make_weight_descriptor
(
conv_param
);
const
auto
bias_g_n_k_wos_desc
=
make_bias_descriptor
(
conv_param
);
const
auto
out_g_n_k_wos_desc
=
make_output_descriptor
(
conv_param
);
Tensor
<
InUserDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiUserDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
OutUserDataType
>
bias
(
bias_g_n_k_wos_desc
);
Tensor
<
OutUserDataType
>
residual
(
bias_g_n_k_wos_desc
);
Tensor
<
OutUserDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutKernelDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"bias: "
<<
bias
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"residual: "
<<
residual
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
mDesc
<<
std
::
endl
;
switch
(
config
.
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InUserDataType
>
{
-
5
,
5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiUserDataType
>
{
-
5
,
5
});
bias
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutUserDataType
>
{
-
5
,
5
});
break
;
default
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InUserDataType
>
{
0.0
,
1.0
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiUserDataType
>
{
-
0.5
,
0.5
});
bias
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutUserDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
in_device_buf
(
sizeof
(
InKernelDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiKernelDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
bias_device_buf
(
sizeof
(
OutKernelDataType
)
*
bias
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
residual_device_buf
(
sizeof
(
OutKernelDataType
)
*
residual
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutKernelDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
InKernelDataType
>
in_converted
(
in
);
const
Tensor
<
WeiKernelDataType
>
wei_converted
(
wei
);
const
Tensor
<
OutKernelDataType
>
bias_converted
(
bias
);
const
Tensor
<
OutKernelDataType
>
residual_converted
(
residual
);
in_device_buf
.
ToDevice
(
in_converted
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei_converted
.
mData
.
data
());
bias_device_buf
.
ToDevice
(
bias_converted
.
mData
.
data
());
residual_device_buf
.
ToDevice
(
residual_converted
.
mData
.
data
());
#else
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
bias_device_buf
.
ToDevice
(
bias
.
mData
.
data
());
residual_device_buf
.
ToDevice
(
residual
.
mData
.
data
());
#endif
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d0_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d0_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d1_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d1_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
bias_g_n_k_wos_desc
.
GetLengths
(),
d0_g_n_k_wos_lengths
);
copy
(
bias_g_n_k_wos_desc
.
GetStrides
(),
d0_g_n_k_wos_strides
);
copy
(
bias_g_n_k_wos_desc
.
GetLengths
(),
d1_g_n_k_wos_lengths
);
copy
(
bias_g_n_k_wos_desc
.
GetStrides
(),
d1_g_n_k_wos_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// do Conv
auto
conv
=
DeviceConvFwdInstance
<
NDimSpatial
>
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
2
>
{
bias_device_buf
.
GetDeviceBuffer
(),
residual_device_buf
.
GetDeviceBuffer
()},
out_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
2
>
{
{
d0_g_n_k_wos_lengths
,
d1_g_n_k_wos_lengths
}},
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
2
>
{
{
d0_g_n_k_wos_strides
,
d1_g_n_k_wos_strides
}},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
InElementOp
{},
WeiElementOp
{},
OutElementOp
{});
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
);
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
flop
=
conv_param
.
GetFlops
();
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InUserDataType
,
WeiUserDataType
,
OutUserDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
if
(
config
.
do_verification
)
{
Tensor
<
CShuffleDataType
>
c_host
(
out_g_n_k_wos_desc
);
auto
ref_conv
=
HostConvFwdInstance
<
NDimSpatial
>
{};
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
,
c_host
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
InElementOp
{},
WeiElementOp
{},
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
// TODO: implement elementwise operation for host
out_host
.
ForEach
([
&
](
auto
&
,
auto
idx
)
{
OutElementOp
{}(
out_host
(
idx
),
c_host
(
idx
),
bias
(
idx
),
residual
(
idx
));
});
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
OutUserDataType
>
out_device_converted
(
out_device
);
return
ck
::
utils
::
check_err
(
out_device_converted
,
out_host
,
"Error: incorrect results!"
,
1
e
-
5
f
,
1
e
-
4
f
);
#else
return
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: incorrect results!"
,
1
e
-
5
f
,
1
e
-
4
f
);
#endif
}
return
true
;
}
bool
run_grouped_conv_fwd_bias_relu_add_example
(
int
argc
,
char
*
argv
[])
{
ExecutionConfig
config
;
ck
::
utils
::
conv
::
ConvParam
conv_param
=
DefaultConvParam
;
if
(
!
parse_cmd_args
(
argc
,
argv
,
config
,
conv_param
))
{
return
false
;
}
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
run_grouped_conv_fwd_bias_relu_add
<
1
>
(
config
,
conv_param
);
case
2
:
return
run_grouped_conv_fwd_bias_relu_add
<
2
>
(
config
,
conv_param
);
case
3
:
return
run_grouped_conv_fwd_bias_relu_add
<
3
>
(
config
,
conv_param
);
}
return
false
;
}
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
View file @
9312548e
...
@@ -23,11 +23,11 @@
...
@@ -23,11 +23,11 @@
constexpr
int
Rank
=
5
;
constexpr
int
Rank
=
5
;
constexpr
int
NumReduceDim
=
3
;
constexpr
int
NumReduceDim
=
3
;
using
XDataType
=
ck
::
half_t
;
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
Acc
DataType
=
float
;
using
Compute
DataType
=
float
;
struct
YElementOp
struct
YElementOp
{
{
...
@@ -50,7 +50,7 @@ using DeviceInstance =
...
@@ -50,7 +50,7 @@ using DeviceInstance =
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
YElementOp
,
YElementOp
,
Rank
,
Rank
,
...
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
...
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
YDataType
,
YDataType
,
Acc
DataType
,
Compute
DataType
,
YElementOp
>
;
YElementOp
>
;
ReferenceInstance
ref
;
ReferenceInstance
ref
;
...
...
example/46_gemm_add_multiply/run_gemm_add_multiply_example.inc
View file @
9312548e
...
@@ -53,7 +53,6 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi
...
@@ -53,7 +53,6 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi
DeviceMem
d1_device_buf
(
sizeof
(
D1DataType
)
*
d1_m_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
d1_device_buf
(
sizeof
(
D1DataType
)
*
d1_m_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
d0_device_buf
.
ToDevice
(
d0_m_n
.
mData
.
data
());
d0_device_buf
.
ToDevice
(
d0_m_n
.
mData
.
data
());
...
@@ -84,8 +83,8 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi
...
@@ -84,8 +83,8 @@ bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfi
if
(
!
device_op
.
IsSupportedArgument
(
argument
))
if
(
!
device_op
.
IsSupportedArgument
(
argument
))
{
{
std
::
cout
<<
"wrong! this device_op instance does not support this problem"
<<
std
::
endl
;
std
::
cout
<<
"wrong! this device_op instance does not support this problem"
<<
std
::
endl
;
return
true
;
return
true
;
}
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
...
...
include/ck/ck.hpp
View file @
9312548e
...
@@ -168,13 +168,6 @@
...
@@ -168,13 +168,6 @@
// tuning parameter
// tuning parameter
#define CK_WORKAROUND_SWDEV_325164 0
#define CK_WORKAROUND_SWDEV_325164 0
// workaround: a BF16 attention kernel for gfx908 is likely affected by a compiler issue
#ifdef __gfx908__
#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 1
#else // __gfx90a__, ...
#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 0
#endif // __gfx908__
// flag to enable (1) or disable (0) the debugging output in some kernels
// flag to enable (1) or disable (0) the debugging output in some kernels
#define DEBUG_LOG 0
#define DEBUG_LOG 0
...
...
include/ck/host_utility/kernel_launch.hpp
View file @
9312548e
...
@@ -20,6 +20,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
...
@@ -20,6 +20,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
#if CK_TIME_KERNEL
#if CK_TIME_KERNEL
if
(
stream_config
.
time_kernel_
)
if
(
stream_config
.
time_kernel_
)
{
{
#if DEBUG_LOG
printf
(
"%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}
\n
"
,
printf
(
"%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}
\n
"
,
__func__
,
__func__
,
grid_dim
.
x
,
grid_dim
.
x
,
...
@@ -29,15 +30,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
...
@@ -29,15 +30,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
block_dim
.
y
,
block_dim
.
y
,
block_dim
.
z
);
block_dim
.
z
);
const
int
nrepeat
=
10
;
printf
(
"Warm up 1 time
\n
"
);
printf
(
"Warm up 1 time
\n
"
);
#endif
// warm up
// warm up
kernel
<<<
grid_dim
,
block_dim
,
lds_byte
,
stream_config
.
stream_id_
>>>
(
args
...);
kernel
<<<
grid_dim
,
block_dim
,
lds_byte
,
stream_config
.
stream_id_
>>>
(
args
...);
const
int
nrepeat
=
10
;
#if DEBUG_LOG
printf
(
"Start running %d times...
\n
"
,
nrepeat
);
printf
(
"Start running %d times...
\n
"
,
nrepeat
);
#endif
hipEvent_t
start
,
stop
;
hipEvent_t
start
,
stop
;
hip_check_error
(
hipEventCreate
(
&
start
));
hip_check_error
(
hipEventCreate
(
&
start
));
...
...
include/ck/tensor_operation/gpu/device/device_normalization.hpp
View file @
9312548e
...
@@ -14,9 +14,9 @@ namespace device {
...
@@ -14,9 +14,9 @@ namespace device {
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
BetaDataType
,
typename
Acc
DataType
,
typename
Compute
DataType
,
typename
YDataType
,
typename
YDataType
,
typename
Acc
ElementwiseOperation
,
typename
Y
ElementwiseOperation
,
index_t
Rank
,
index_t
Rank
,
index_t
NumReduceDim
>
index_t
NumReduceDim
>
struct
DeviceNormalization
:
public
BaseOperator
struct
DeviceNormalization
:
public
BaseOperator
...
@@ -35,7 +35,7 @@ struct DeviceNormalization : public BaseOperator
...
@@ -35,7 +35,7 @@ struct DeviceNormalization : public BaseOperator
void
*
p_y
,
void
*
p_y
,
void
*
p_savedMean
,
void
*
p_savedMean
,
void
*
p_savedInvVar
,
void
*
p_savedInvVar
,
Acc
ElementwiseOperation
acc
_elementwise_op
)
=
0
;
Y
ElementwiseOperation
y
_elementwise_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
};
...
@@ -43,17 +43,17 @@ struct DeviceNormalization : public BaseOperator
...
@@ -43,17 +43,17 @@ struct DeviceNormalization : public BaseOperator
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
BetaDataType
,
typename
Acc
DataType
,
typename
Compute
DataType
,
typename
YDataType
,
typename
YDataType
,
typename
Acc
ElementwiseOperation
,
typename
Y
ElementwiseOperation
,
index_t
Rank
,
index_t
Rank
,
index_t
NumReduceDim
>
index_t
NumReduceDim
>
using
DeviceNormalizationPtr
=
std
::
unique_ptr
<
DeviceNormalization
<
XDataType
,
using
DeviceNormalizationPtr
=
std
::
unique_ptr
<
DeviceNormalization
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
Acc
ElementwiseOperation
,
Y
ElementwiseOperation
,
Rank
,
Rank
,
NumReduceDim
>>
;
NumReduceDim
>>
;
...
...
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
0 → 100644
View file @
9312548e
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
0 → 100644
View file @
9312548e
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
0 → 100644
View file @
9312548e
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp
View file @
9312548e
...
@@ -10,46 +10,11 @@
...
@@ -10,46 +10,11 @@
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_normalization_
w
el
ford_variance
.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_normalization_
s
el
ector
.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace
ck
{
template
<
typename
GridwiseReduction
,
typename
XDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
YDataType
,
typename
AccDataType
,
typename
AccElementwiseOperation
,
typename
GridDesc_M_K
>
__global__
void
kernel_normalization
(
const
GridDesc_M_K
x_grid_desc_m_k
,
const
GridDesc_M_K
gamma_grid_desc_m_k
,
const
GridDesc_M_K
beta_grid_desc_m_k
,
const
GridDesc_M_K
y_grid_desc_m_k
,
index_t
num_k_block_tile_iteration
,
AccDataType
epsilon
,
const
XDataType
*
const
__restrict__
p_x_global
,
const
GammaDataType
*
const
__restrict__
p_gamma_global
,
const
BetaDataType
*
const
__restrict__
p_beta_global
,
YDataType
*
const
__restrict__
p_y_global
,
const
AccElementwiseOperation
acc_elementwise_op
)
{
GridwiseReduction
::
Run
(
x_grid_desc_m_k
,
gamma_grid_desc_m_k
,
beta_grid_desc_m_k
,
y_grid_desc_m_k
,
num_k_block_tile_iteration
,
epsilon
,
p_x_global
,
p_gamma_global
,
p_beta_global
,
p_y_global
,
acc_elementwise_op
);
};
}
// namespace ck
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
device
{
namespace
device
{
...
@@ -58,9 +23,9 @@ namespace device {
...
@@ -58,9 +23,9 @@ namespace device {
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
BetaDataType
,
typename
Acc
DataType
,
typename
Compute
DataType
,
typename
YDataType
,
typename
YDataType
,
typename
Acc
ElementwiseOperation
,
typename
Y
ElementwiseOperation
,
index_t
Rank
,
index_t
Rank
,
index_t
NumReduceDim
,
index_t
NumReduceDim
,
index_t
BlockSize
,
index_t
BlockSize
,
...
@@ -74,16 +39,18 @@ template <typename XDataType,
...
@@ -74,16 +39,18 @@ template <typename XDataType,
index_t
GammaSrcVectorSize
,
index_t
GammaSrcVectorSize
,
index_t
BetaSrcVectorDim
,
index_t
BetaSrcVectorDim
,
index_t
BetaSrcVectorSize
,
index_t
BetaSrcVectorSize
,
index_t
YDstVectorSize
>
index_t
YDstVectorSize
,
bool
UseWelford
=
true
>
struct
DeviceNormalizationImpl
:
public
DeviceNormalization
<
XDataType
,
struct
DeviceNormalizationImpl
:
public
DeviceNormalization
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
Acc
ElementwiseOperation
,
Y
ElementwiseOperation
,
Rank
,
Rank
,
NumReduceDim
>
NumReduceDim
>
{
{
static_assert
(
BlockSize
==
MThreadClusterSize
*
KThreadClusterSize
);
static_assert
(
static_assert
(
((
GammaSrcVectorDim
==
0
&&
MThreadSliceSize
%
GammaSrcVectorSize
==
0
)
||
((
GammaSrcVectorDim
==
0
&&
MThreadSliceSize
%
GammaSrcVectorSize
==
0
)
||
(
GammaSrcVectorDim
==
1
&&
KThreadSliceSize
%
GammaSrcVectorSize
==
0
)),
(
GammaSrcVectorDim
==
1
&&
KThreadSliceSize
%
GammaSrcVectorSize
==
0
)),
...
@@ -167,51 +134,6 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -167,51 +134,6 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
using
GridDesc_M_K
=
decltype
(
MakeSrc2dDescriptor
({
1
},
{
1
},
1
,
1
));
using
GridDesc_M_K
=
decltype
(
MakeSrc2dDescriptor
({
1
},
{
1
},
1
,
1
));
using
GridwiseReduceLayernormGeneric
=
GridwiseNormalizationWelfordVariance_mk_to_mk
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
AccElementwiseOperation
,
GridDesc_M_K
,
BlockSize
,
MThreadClusterSize
,
KThreadClusterSize
,
MThreadSliceSize
,
KThreadSliceSize
,
XYSrcVectorDim
,
XSrcVectorSize
,
GammaSrcVectorDim
,
GammaSrcVectorSize
,
BetaSrcVectorDim
,
BetaSrcVectorSize
,
XYSrcVectorDim
,
YDstVectorSize
,
false
>
;
using
GridwiseNormalizationSweepOnce
=
GridwiseNormalizationWelfordVariance_mk_to_mk
<
XDataType
,
GammaDataType
,
BetaDataType
,
YDataType
,
AccDataType
,
AccElementwiseOperation
,
GridDesc_M_K
,
BlockSize
,
MThreadClusterSize
,
KThreadClusterSize
,
MThreadSliceSize
,
KThreadSliceSize
,
XYSrcVectorDim
,
XSrcVectorSize
,
GammaSrcVectorDim
,
GammaSrcVectorSize
,
BetaSrcVectorDim
,
BetaSrcVectorSize
,
XYSrcVectorDim
,
YDstVectorSize
,
true
>
;
struct
Argument
:
public
BaseArgument
struct
Argument
:
public
BaseArgument
{
{
Argument
(
const
std
::
vector
<
index_t
>
lengths
,
Argument
(
const
std
::
vector
<
index_t
>
lengths
,
...
@@ -220,7 +142,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -220,7 +142,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
const
std
::
vector
<
index_t
>
betaStrides
,
const
std
::
vector
<
index_t
>
betaStrides
,
const
std
::
vector
<
index_t
>
yStrides
,
const
std
::
vector
<
index_t
>
yStrides
,
const
std
::
vector
<
index_t
>
reduceDims
,
const
std
::
vector
<
index_t
>
reduceDims
,
Acc
ElementwiseOperation
acc
_elementwise_op
,
Y
ElementwiseOperation
y
_elementwise_op
,
double
epsilon
,
double
epsilon
,
const
XDataType
*
p_x
,
const
XDataType
*
p_x
,
const
GammaDataType
*
p_gamma
,
const
GammaDataType
*
p_gamma
,
...
@@ -230,9 +152,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -230,9 +152,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
p_gamma_
(
p_gamma
),
p_gamma_
(
p_gamma
),
p_beta_
(
p_beta
),
p_beta_
(
p_beta
),
p_y_
(
p_y
),
p_y_
(
p_y
),
acc
_elementwise_op_
(
acc
_elementwise_op
)
y
_elementwise_op_
(
y
_elementwise_op
)
{
{
epsilon_
=
static_cast
<
Acc
DataType
>
(
epsilon
);
epsilon_
=
static_cast
<
Compute
DataType
>
(
epsilon
);
Lengths_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
lengths
,
reduceDims
);
Lengths_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
lengths
,
reduceDims
);
xStrides_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
xStrides
,
reduceDims
);
xStrides_
=
shuffle_tensor_dimensions
<
Rank
,
NumReduceDim
>
(
xStrides
,
reduceDims
);
...
@@ -265,7 +187,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -265,7 +187,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
x_grid_desc_m_k_
.
GetLength
(
Number
<
1
>
{})
<=
KThreadClusterSize
*
KThreadSliceSize
;
x_grid_desc_m_k_
.
GetLength
(
Number
<
1
>
{})
<=
KThreadClusterSize
*
KThreadSliceSize
;
}
}
Acc
DataType
epsilon_
;
Compute
DataType
epsilon_
;
const
XDataType
*
p_x_
;
const
XDataType
*
p_x_
;
const
GammaDataType
*
p_gamma_
;
const
GammaDataType
*
p_gamma_
;
...
@@ -278,7 +200,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -278,7 +200,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
std
::
vector
<
index_t
>
betaStrides_
;
std
::
vector
<
index_t
>
betaStrides_
;
std
::
vector
<
index_t
>
yStrides_
;
std
::
vector
<
index_t
>
yStrides_
;
Acc
ElementwiseOperation
acc
_elementwise_op_
;
Y
ElementwiseOperation
y
_elementwise_op_
;
int
blkGroupSize_
;
int
blkGroupSize_
;
int
numBlockTileIteration_
;
int
numBlockTileIteration_
;
...
@@ -295,23 +217,27 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -295,23 +217,27 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
{
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
{
const
auto
kernel_main
=
arg
.
isSweeponce_
auto
kernel_main
=
NormalizationKernelSelector
<
XDataType
,
?
kernel_normalization
<
GridwiseNormalizationSweepOnce
,
GammaDataType
,
XDataType
,
BetaDataType
,
GammaDataType
,
YDataType
,
BetaDataType
,
ComputeDataType
,
YDataType
,
YElementwiseOperation
,
AccDataType
,
GridDesc_M_K
,
AccElementwiseOperation
,
BlockSize
,
GridDesc_M_K
>
MThreadClusterSize
,
:
kernel_normalization
<
GridwiseReduceLayernormGeneric
,
KThreadClusterSize
,
XDataType
,
MThreadSliceSize
,
GammaDataType
,
KThreadSliceSize
,
BetaDataType
,
XYSrcVectorDim
,
YDataType
,
XSrcVectorSize
,
AccDataType
,
GammaSrcVectorDim
,
AccElementwiseOperation
,
GammaSrcVectorSize
,
GridDesc_M_K
>
;
BetaSrcVectorDim
,
BetaSrcVectorSize
,
XYSrcVectorDim
,
YDstVectorSize
,
UseWelford
>
(
arg
.
isSweeponce_
);
float
avg_time
=
0
;
float
avg_time
=
0
;
avg_time
+=
launch_and_time_kernel
(
stream_config
,
avg_time
+=
launch_and_time_kernel
(
stream_config
,
...
@@ -329,7 +255,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -329,7 +255,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
arg
.
p_gamma_
,
arg
.
p_gamma_
,
arg
.
p_beta_
,
arg
.
p_beta_
,
arg
.
p_y_
,
arg
.
p_y_
,
arg
.
acc
_elementwise_op_
);
arg
.
y
_elementwise_op_
);
return
(
avg_time
);
return
(
avg_time
);
};
};
...
@@ -429,7 +355,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -429,7 +355,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
void
*
p_y
,
void
*
p_y
,
void
*
p_saveMean
,
void
*
p_saveMean
,
void
*
p_saveInvVar
,
void
*
p_saveInvVar
,
Acc
ElementwiseOperation
acc
_elementwise_op
)
override
Y
ElementwiseOperation
y
_elementwise_op
)
override
{
{
// TODO
// TODO
// Optional cache of the intermediate results (mean and InvVariance) during the
// Optional cache of the intermediate results (mean and InvVariance) during the
...
@@ -443,7 +369,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -443,7 +369,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
betaStrides
,
betaStrides
,
yStrides
,
yStrides
,
reduceDims
,
reduceDims
,
acc
_elementwise_op
,
y
_elementwise_op
,
epsilon
,
epsilon
,
static_cast
<
const
XDataType
*>
(
p_x
),
static_cast
<
const
XDataType
*>
(
p_x
),
static_cast
<
const
GammaDataType
*>
(
p_gamma
),
static_cast
<
const
GammaDataType
*>
(
p_gamma
),
...
@@ -462,8 +388,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
...
@@ -462,8 +388,8 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
// clang-format off
// clang-format off
str
<<
"DeviceNormalizationImpl<"
<<
BlockSize
<<
","
;
str
<<
"DeviceNormalizationImpl<"
<<
BlockSize
<<
","
;
str
<<
"
M_C
"
<<
MThreadClusterSize
<<
"_
S
"
<<
M
Thread
Slice
Size
<<
","
;
str
<<
"
Cluster_MK_
"
<<
MThreadClusterSize
<<
"_"
<<
K
Thread
Cluster
Size
<<
","
;
str
<<
"K_
C
"
<<
K
Thread
Cluster
Size
<<
"_
S
"
<<
KThreadSliceSize
<<
","
;
str
<<
"
Slice_M
K_"
<<
M
Thread
Slice
Size
<<
"_"
<<
KThreadSliceSize
<<
","
;
str
<<
"XYSrcVectorDim_"
<<
XYSrcVectorDim
<<
","
;
str
<<
"XYSrcVectorDim_"
<<
XYSrcVectorDim
<<
","
;
str
<<
"VectorSize_X"
<<
XSrcVectorSize
<<
"_Gamma"
<<
GammaSrcVectorSize
<<
"_Beta"
<<
BetaSrcVectorSize
<<
"_Y"
<<
YDstVectorSize
<<
">"
;
str
<<
"VectorSize_X"
<<
XSrcVectorSize
<<
"_Gamma"
<<
GammaSrcVectorSize
<<
"_Beta"
<<
BetaSrcVectorSize
<<
"_Y"
<<
YDstVectorSize
<<
">"
;
// clang-format on
// clang-format on
...
...
include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
View file @
9312548e
...
@@ -150,6 +150,13 @@ struct Bilinear
...
@@ -150,6 +150,13 @@ struct Bilinear
template
<
typename
Y
,
typename
X0
,
typename
X1
>
template
<
typename
Y
,
typename
X0
,
typename
X1
>
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
)
const
;
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
,
double
,
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
y
=
alpha_
*
x0
+
beta_
*
x1
;
};
template
<
>
template
<
>
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
operator
()
<
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
...
...
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
View file @
9312548e
...
@@ -95,6 +95,12 @@ struct Scale
...
@@ -95,6 +95,12 @@ struct Scale
y
=
scale_
*
x
;
y
=
scale_
*
x
;
};
};
template
<
>
__host__
__device__
void
operator
()
<
double
,
double
>
(
double
&
y
,
const
double
&
x
)
const
{
y
=
scale_
*
x
;
};
float
scale_
;
float
scale_
;
};
};
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment