Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
cba8f7f2
Commit
cba8f7f2
authored
Jun 26, 2022
by
Anthony Chang
Browse files
Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4
parents
cc50b687
b653c5eb
Changes
583
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
478 additions
and
5822 deletions
+478
-5822
example/19_binary_elementwise/elementwise_add_4d.cpp
example/19_binary_elementwise/elementwise_add_4d.cpp
+12
-34
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp
+15
-15
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
...nvnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
+16
-16
example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
..._gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
+15
-11
example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
+15
-11
example/22_cgemm/cgemm_xdl_fp16.cpp
example/22_cgemm/cgemm_xdl_fp16.cpp
+14
-38
example/23_softmax/CMakeLists.txt
example/23_softmax/CMakeLists.txt
+1
-0
example/23_softmax/README.md
example/23_softmax/README.md
+18
-0
example/23_softmax/softmax_blockwise.cpp
example/23_softmax/softmax_blockwise.cpp
+253
-0
example/CMakeLists.txt
example/CMakeLists.txt
+3
-17
external/include/half/half.hpp
external/include/half/half.hpp
+0
-5670
include/ck/ck.hpp
include/ck/ck.hpp
+9
-7
include/ck/device_utility/device_prop.hpp
include/ck/device_utility/device_prop.hpp
+4
-0
include/ck/device_utility/hip_check_error.hpp
include/ck/device_utility/hip_check_error.hpp
+17
-0
include/ck/device_utility/kernel_launch.hpp
include/ck/device_utility/kernel_launch.hpp
+74
-0
include/ck/options.hpp
include/ck/options.hpp
+0
-3
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
...ckward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
+3
-0
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
...ward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
+3
-0
include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp
...ht_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp
+3
-0
include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
...rd_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+3
-0
No files found.
example/19_binary_elementwise/elementwise_add_4d.cpp
View file @
cba8f7f2
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <iostream>
#include <iostream>
#include <cstdlib>
#include <cstdlib>
#include "check_err.hpp"
#include "c
onfig
.hpp"
#include "c
k/ck
.hpp"
#include "
device
.hpp"
#include "
ck/tensor_operation/gpu/element/binary_element_wise_operation
.hpp"
#include "
host_tensor
.hpp"
#include "
ck/tensor_operation/gpu/device/device_binary_elementwise
.hpp"
#include "host_tensor_generator.hpp"
#include "ck/library/utility/check_err.hpp"
#include "device_
tens
or.hpp"
#include "
ck/library/host_tensor/
device_
mem
or
y
.hpp"
#include "
binary_element_wise_operation
.hpp"
#include "
ck/library/host_tensor/host_tensor
.hpp"
#include "
device_binary_elementwise
.hpp"
#include "
ck/library/host_tensor/host_tensor_generator
.hpp"
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
...
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl.cpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "check_err.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "conv_util.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "config.hpp"
#include "print.hpp"
#include "ck/library/utility/check_err.hpp"
#include "device.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "host_tensor.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "host_tensor_generator.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "device_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
#include "element_wise_operation.hpp"
#include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_backward_weight.hpp"
using
InDataType
=
ck
::
half_t
;
using
InDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
...
...
example/20_convnd_bwd_weight_xdl/convnd_bwd_weight_xdl_bf16_splitk.cpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "check_err.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "conv_util.hpp"
#include "ck/tensor_operation/gpu/device/device_unary_elementwise.hpp"
#include "config.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "print.hpp"
#include "device.hpp"
#include "ck/library/utility/check_err.hpp"
#include "host_tensor.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "host_tensor_generator.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "device_tensor.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "tensor_layout.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"
#include "device_unary_elementwise.hpp"
#include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_backward_weight.hpp"
using
InDataType
=
ck
::
bhalf_t
;
using
InDataType
=
ck
::
bhalf_t
;
using
WeiDataType
=
ck
::
bhalf_t
;
using
WeiDataType
=
ck
::
bhalf_t
;
...
...
example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include "check_err.hpp"
#include "ck/ck.hpp"
#include "config.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "device.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "host_tensor.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp"
#include "host_tensor_generator.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "device_tensor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "device_5ary_elementwise.hpp"
#include "device_gemm_bias_add_reduce_xdl_cshuffle.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "element_wise_operation.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "reference_gemm.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "gemm_specialization.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include "check_err.hpp"
#include "ck/ck.hpp"
#include "config.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "device.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "host_tensor.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp"
#include "host_tensor_generator.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "device_tensor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "device_5ary_elementwise.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "element_wise_operation.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "reference_gemm.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "gemm_specialization.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
example/22_cgemm/cgemm_xdl_fp16.cpp
View file @
cba8f7f2
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "check_err.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "config.hpp"
#include "ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "ck/library/utility/check_err.hpp"
#include "host_tensor_generator.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "device_tensor.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "device_cgemm_4gemm_xdl_cshuffle.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"
#include "reference_cgemm.hpp"
#include "gemm_specialization.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
example/23_softmax/CMakeLists.txt
0 → 100644
View file @
cba8f7f2
add_example_executable
(
example_softmax_blockwise softmax_blockwise.cpp
)
\ No newline at end of file
example/23_softmax/README.md
0 → 100644
View file @
cba8f7f2
# Instructions for ```example_softmax_blockwise```
## Run ```example_softmax_blockwise```
```
bash
# -D <xxx> : input 3-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: time kernel (0=no, 1=yes)
example_softmax_blockwise
-D
4,128,2048
-v
1 1 1
```
Result
```
launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.0242877 ms, 259.039 GB/s, DeviceReduceSoftmax<256,M_C8_S1,K_C32_S8,InSrcVectorDim_1_InSrcVectorSize_8_OutDstVectorSize_8>
```
example/23_softmax/softmax_blockwise.cpp
0 → 100644
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
constexpr
int
Rank
=
3
;
constexpr
int
NumReduceDim
=
1
;
using
DeviceInstance
=
DeviceSoftmax
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
256
,
// BlockSize
8
,
// ClusterM
32
,
// ClusterK
1
,
// SliceM
8
,
// SliceK
1
,
// SrcVecDim (0=M, 1=K)
8
,
// SrcScalarPerVector
8
>
;
// OutScalarPerVector
static
struct
option
long_options
[]
=
{{
"inLengths"
,
required_argument
,
nullptr
,
'D'
},
{
"verify"
,
required_argument
,
nullptr
,
'v'
},
{
"help"
,
no_argument
,
nullptr
,
'?'
},
{
nullptr
,
0
,
nullptr
,
0
}};
class
SimpleAppArgs
{
private:
int
option_index
=
0
;
public:
std
::
vector
<
size_t
>
inLengths
=
{
8
,
128
,
2048
};
std
::
vector
<
AccDataType
>
scales
=
{
2.0
f
,
2.0
f
};
bool
do_verification
=
true
;
int
init_method
=
2
;
bool
time_kernel
=
true
;
public:
void
show_usage
(
const
char
*
cmd
)
{
std
::
cout
<<
"Usage of "
<<
cmd
<<
std
::
endl
;
std
::
cout
<<
"--inLengths or -D, comma separated list of input tensor dimension lengths"
<<
std
::
endl
;
std
::
cout
<<
"--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<<
std
::
endl
;
std
::
cout
<<
"Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<<
std
::
endl
;
std
::
cout
<<
"Arg2 -- time kernel (0=no, 1=yes)"
<<
std
::
endl
;
};
int
processArgs
(
int
argc
,
char
*
argv
[])
{
using
ck
::
host_common
::
getTypeValuesFromString
;
int
ch
;
while
(
1
)
{
ch
=
getopt_long
(
argc
,
argv
,
"D:v:l:"
,
long_options
,
&
option_index
);
if
(
ch
==
-
1
)
break
;
switch
(
ch
)
{
case
'D'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
inLengths
=
getTypeValuesFromString
<
size_t
>
(
optarg
);
break
;
case
'v'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
do_verification
=
static_cast
<
bool
>
(
std
::
atoi
(
optarg
));
break
;
case
'?'
:
if
(
std
::
string
(
long_options
[
option_index
].
name
)
==
"help"
)
{
show_usage
(
argv
[
0
]);
return
(
-
1
);
};
break
;
default:
show_usage
(
argv
[
0
]);
return
(
-
1
);
};
};
if
(
optind
+
2
>
argc
)
throw
std
::
runtime_error
(
"Invalid cmd-line arguments, more argumetns are needed!"
);
init_method
=
std
::
atoi
(
argv
[
optind
++
]);
time_kernel
=
static_cast
<
bool
>
(
std
::
atoi
(
argv
[
optind
]));
if
(
scales
.
empty
())
{
scales
.
push_back
(
1.0
f
);
scales
.
push_back
(
0.0
f
);
};
return
(
0
);
};
};
int
main
(
int
argc
,
char
*
argv
[])
{
// Example: batched gemm C[G, M, N] applies max/sum reduction along N internally
const
std
::
vector
<
int
>
invariantDims
{
0
,
1
};
const
std
::
vector
<
int
>
reduceDims
{
2
};
SimpleAppArgs
args
;
if
(
argc
>
1
)
{
if
(
args
.
processArgs
(
argc
,
argv
)
<
0
)
return
(
-
1
);
};
Tensor
<
InDataType
>
in
(
args
.
inLengths
);
Tensor
<
OutDataType
>
out_ref
(
args
.
inLengths
);
Tensor
<
OutDataType
>
out
(
args
.
inLengths
);
auto
inStrides
=
in
.
mDesc
.
GetStrides
();
auto
outStrides
=
out
.
mDesc
.
GetStrides
();
AccDataType
alpha
=
args
.
scales
[
0
];
AccDataType
beta
=
args
.
scales
[
1
];
std
::
size_t
num_thread
=
1
;
if
(
args
.
do_verification
)
{
switch
(
args
.
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{
1
},
num_thread
);
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_1
<
OutDataType
>
{
1
},
num_thread
);
break
;
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
},
num_thread
);
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
},
num_thread
);
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
5.0
,
5.0
},
num_thread
);
if
(
beta
!=
0.0
f
)
out_ref
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
5.0
,
5.0
},
num_thread
);
}
if
(
beta
!=
0.0
f
)
for
(
size_t
i
=
0
;
i
<
out_ref
.
mDesc
.
GetElementSpace
();
i
++
)
out
.
mData
[
i
]
=
out_ref
.
mData
[
i
];
};
// std::cout << "beta = " << beta << std::endl;
// LogRangeAsType<float>(std::cout << "tensor in: " , in.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "tensor prior out: " , out.mData, ",") << std::endl;
// these buffers are usually provided by the user application
DeviceMem
in_dev
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
out_dev
(
sizeof
(
OutDataType
)
*
out
.
mDesc
.
GetElementSpace
());
in_dev
.
ToDevice
(
in
.
mData
.
data
());
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
if
(
args
.
do_verification
)
{
using
ReferenceInstance
=
tensor_operation
::
host
::
ReferenceSoftmax
<
InDataType
,
OutDataType
,
AccDataType
>
;
ReferenceInstance
ref
;
auto
ref_arg
=
ref
.
MakeArgument
(
in
,
out_ref
,
alpha
,
beta
,
Rank
,
reduceDims
);
auto
invoker
=
ref
.
MakeInvoker
();
invoker
.
Run
(
ref_arg
);
// LogRangeAsType<float>(std::cout << "tensor out_ref: ", out_ref.mData, ",") << std::endl;
};
std
::
vector
<
ck
::
index_t
>
i_inLengths
;
std
::
vector
<
ck
::
index_t
>
i_inStrides
;
i_inLengths
.
assign
(
args
.
inLengths
.
begin
(),
args
.
inLengths
.
end
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
auto
device_instance
=
DeviceInstance
{};
auto
argument_ptr
=
device_instance
.
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
reduceDims
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
());
if
(
!
device_instance
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
std
::
cout
<<
"The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
<<
std
::
endl
;
return
1
;
};
std
::
string
instance_name
=
device_instance
.
GetTypeString
();
auto
invoker_ptr
=
device_instance
.
MakeInvokerPointer
();
bool
pass
=
true
;
if
(
args
.
do_verification
)
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
out_dev
.
FromDevice
(
out
.
mData
.
data
());
// LogRangeAsType<float>(std::cout << "tensor out: " , out.mData, ",") << std::endl;
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
};
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
args
.
time_kernel
});
std
::
size_t
num_bytes
=
in
.
mDesc
.
GetElementSize
()
*
sizeof
(
InDataType
)
+
(
beta
==
0.0
f
?
1
:
2
)
*
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
OutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
instance_name
<<
std
::
endl
;
return
(
pass
?
0
:
1
);
}
example/CMakeLists.txt
View file @
cba8f7f2
include_directories
(
BEFORE
include_directories
(
BEFORE
${
PROJECT_SOURCE_DIR
}
/include/ck
${
PROJECT_SOURCE_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/include/ck/utility
${
PROJECT_SOURCE_DIR
}
/library/include
${
PROJECT_SOURCE_DIR
}
/include/ck/host_utility
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_description
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor
${
PROJECT_SOURCE_DIR
}
/include/ck/problem_transform
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/device
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/grid
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/block
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/warp
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/thread
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_operation/gpu/element
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/host_tensor
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/reference_tensor_operation/cpu
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/reference_tensor_operation/gpu
${
PROJECT_SOURCE_DIR
}
/library/include/ck/library/utility
${
PROJECT_SOURCE_DIR
}
/external/include/half
)
)
add_custom_target
(
examples
)
add_custom_target
(
examples
)
...
@@ -56,3 +41,4 @@ add_subdirectory(19_binary_elementwise)
...
@@ -56,3 +41,4 @@ add_subdirectory(19_binary_elementwise)
add_subdirectory
(
20_convnd_bwd_weight_xdl
)
add_subdirectory
(
20_convnd_bwd_weight_xdl
)
add_subdirectory
(
21_gemm_layernorm
)
add_subdirectory
(
21_gemm_layernorm
)
add_subdirectory
(
22_cgemm
)
add_subdirectory
(
22_cgemm
)
add_subdirectory
(
23_softmax
)
external/include/half/half.hpp
deleted
100644 → 0
View file @
cc50b687
This diff is collapsed.
Click to expand it.
include/ck/c
onfig
.hpp
→
include/ck/c
k
.hpp
View file @
cba8f7f2
#ifndef CK_CONFIG_AMD_HPP
// SPDX-License-Identifier: MIT
#define CK_CONFIG_AMD_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "hip/hip_fp16.h"
#endif
#endif
#define CK_TIME_KERNEL 1
// constant address space for kernel parameter
// constant address space for kernel parameter
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
...
@@ -139,10 +143,6 @@
...
@@ -139,10 +143,6 @@
// tuning parameter
// tuning parameter
#define CK_WORKAROUND_SWDEV_325164 1
#define CK_WORKAROUND_SWDEV_325164 1
// workaround for verification failure ConvNd forward
// https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/135
#define CK_WORKAROUND_GITHUB_135 1
namespace
ck
{
namespace
ck
{
enum
struct
InMemoryDataOperationEnum
enum
struct
InMemoryDataOperationEnum
...
@@ -153,6 +153,7 @@ enum struct InMemoryDataOperationEnum
...
@@ -153,6 +153,7 @@ enum struct InMemoryDataOperationEnum
Add
Add
};
};
// FIXME: use regular Sequence and remove this
template
<
InMemoryDataOperationEnum
...
Is
>
template
<
InMemoryDataOperationEnum
...
Is
>
struct
InMemoryDataOperationEnumSequence
struct
InMemoryDataOperationEnumSequence
{
{
...
@@ -166,6 +167,7 @@ struct InMemoryDataOperationEnumSequence
...
@@ -166,6 +167,7 @@ struct InMemoryDataOperationEnumSequence
}
}
};
};
#if 0
// TODO: no longer needed, remove this
// TODO: no longer needed, remove this
enum struct ActivTypeEnum
enum struct ActivTypeEnum
{
{
...
@@ -173,10 +175,10 @@ enum struct ActivTypeEnum
...
@@ -173,10 +175,10 @@ enum struct ActivTypeEnum
LeakyRelu,
LeakyRelu,
Sigmoid
Sigmoid
};
};
#endif
// index type
// index type
using
index_t
=
int32_t
;
using
index_t
=
int32_t
;
using
long_index_t
=
int64_t
;
using
long_index_t
=
int64_t
;
}
// namespace ck
}
// namespace ck
#endif
include/ck/
host
_utility/device_prop.hpp
→
include/ck/
device
_utility/device_prop.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include <string>
#include <string>
#include <map>
#include <map>
#include <hip/hip_runtime.h>
namespace
ck
{
namespace
ck
{
...
...
include/ck/device_utility/hip_check_error.hpp
0 → 100644
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
inline
void
hip_check_error
(
hipError_t
x
)
{
if
(
x
!=
hipSuccess
)
{
std
::
ostringstream
ss
;
ss
<<
"HIP runtime error: "
<<
hipGetErrorString
(
x
)
<<
". "
<<
__FILE__
<<
": "
<<
__LINE__
<<
"in function: "
<<
__func__
;
throw
std
::
runtime_error
(
ss
.
str
());
}
}
library/
include/ck/
library/host_tensor/device
.hpp
→
include/ck/
device_utility/kernel_launch
.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include <memory>
#include <functional>
#include <thread>
#include <chrono>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include "stream_config.hpp"
#include "ck/options.hpp"
template
<
typename
T
>
__global__
void
set_buffer_value
(
T
*
p
,
T
x
,
uint64_t
buffer_element_size
)
{
for
(
uint64_t
i
=
threadIdx
.
x
;
i
<
buffer_element_size
;
i
+=
blockDim
.
x
)
{
p
[
i
]
=
x
;
}
}
inline
void
hip_check_error
(
hipError_t
x
)
#include "ck/ck.hpp"
{
#include "ck/stream_config.hpp"
if
(
x
!=
hipSuccess
)
#include "ck/device_utility/hip_check_error.hpp"
{
std
::
ostringstream
ss
;
ss
<<
"HIP runtime error: "
<<
hipGetErrorString
(
x
)
<<
". "
<<
__FILE__
<<
": "
<<
__LINE__
<<
"in function: "
<<
__func__
;
throw
std
::
runtime_error
(
ss
.
str
());
}
}
struct
DeviceMem
{
DeviceMem
()
=
delete
;
DeviceMem
(
std
::
size_t
mem_size
);
void
*
GetDeviceBuffer
();
std
::
size_t
GetBufferSize
();
void
ToDevice
(
const
void
*
p
);
void
FromDevice
(
void
*
p
);
void
SetZero
();
template
<
typename
T
>
void
SetValue
(
T
x
)
{
if
(
mMemSize
%
sizeof
(
T
)
!=
0
)
{
throw
std
::
runtime_error
(
"wrong! not entire DeviceMem will be set"
);
}
set_buffer_value
<
T
><<<
1
,
1024
>>>
(
static_cast
<
T
*>
(
mpDeviceBuf
),
x
,
mMemSize
/
sizeof
(
T
));
}
~
DeviceMem
();
void
*
mpDeviceBuf
;
std
::
size_t
mMemSize
;
};
struct
KernelTimerImpl
;
struct
KernelTimer
{
KernelTimer
();
~
KernelTimer
();
void
Start
();
void
End
();
float
GetElapsedTime
()
const
;
std
::
unique_ptr
<
KernelTimerImpl
>
impl
;
};
template
<
typename
...
Args
,
typename
F
>
template
<
typename
...
Args
,
typename
F
>
float
launch_and_time_kernel
(
const
StreamConfig
&
stream_config
,
float
launch_and_time_kernel
(
const
StreamConfig
&
stream_config
,
...
@@ -97,17 +38,27 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
...
@@ -97,17 +38,27 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
printf
(
"Start running %d times...
\n
"
,
nrepeat
);
printf
(
"Start running %d times...
\n
"
,
nrepeat
);
KernelTimer
timer
;
hipEvent_t
start
,
stop
;
timer
.
Start
();
hip_check_error
(
hipEventCreate
(
&
start
));
hip_check_error
(
hipEventCreate
(
&
stop
));
hip_check_error
(
hipDeviceSynchronize
());
hip_check_error
(
hipEventRecord
(
start
,
stream_config
.
stream_id_
));
for
(
int
i
=
0
;
i
<
nrepeat
;
++
i
)
for
(
int
i
=
0
;
i
<
nrepeat
;
++
i
)
{
{
kernel
<<<
grid_dim
,
block_dim
,
lds_byte
,
stream_config
.
stream_id_
>>>
(
args
...);
kernel
<<<
grid_dim
,
block_dim
,
lds_byte
,
stream_config
.
stream_id_
>>>
(
args
...);
}
}
timer
.
End
();
hip_check_error
(
hipEventRecord
(
stop
,
stream_config
.
stream_id_
));
hip_check_error
(
hipEventSynchronize
(
stop
));
float
total_time
=
0
;
hip_check_error
(
hipEventElapsedTime
(
&
total_time
,
start
,
stop
));
return
t
imer
.
GetElapsedT
ime
()
/
nrepeat
;
return
t
otal_t
ime
/
nrepeat
;
}
}
else
else
{
{
...
...
include/ck/options.hpp
deleted
100644 → 0
View file @
cc50b687
#pragma once
#define CK_TIME_KERNEL 1
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
...
...
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
...
...
include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP
#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP
#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP
#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP
...
...
include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
View file @
cba8f7f2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
...
...
Prev
1
2
3
4
5
6
7
…
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment