Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
20ea6c75
"official/projects/bigbird/encoder.py" did not exist on "01116d8e8aa8bad91a1882e0e65d0542b0beabd7"
Unverified
Commit
20ea6c75
authored
Nov 30, 2023
by
Bartlomiej Wroblewski
Committed by
GitHub
Nov 30, 2023
Browse files
Merge branch 'develop' into bwroblew/direct_load_double_buf
parents
92a0393a
8ff845f2
Changes
45
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
321 additions
and
45 deletions
+321
-45
include/ck/utility/tuple_helper.hpp
include/ck/utility/tuple_helper.hpp
+88
-0
library/src/tensor_operation_instance/gpu/CMakeLists.txt
library/src/tensor_operation_instance/gpu/CMakeLists.txt
+147
-42
profiler/include/profiler/profile_gemm_impl.hpp
profiler/include/profiler/profile_gemm_impl.hpp
+1
-1
profiler/src/profile_transpose.cpp
profiler/src/profile_transpose.cpp
+85
-0
test/transpose/test_transpose_ut_cases.inc
test/transpose/test_transpose_ut_cases.inc
+0
-2
No files found.
include/ck/utility/tuple_helper.hpp
View file @
20ea6c75
...
...
@@ -5,6 +5,7 @@
#include "functional4.hpp"
#include "tuple.hpp"
#include "is_detected.hpp"
namespace
ck
{
...
...
@@ -33,6 +34,28 @@ __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple<X&...>&
ty
);
}
template
<
typename
...
X
,
typename
...
Y
>
__host__
__device__
constexpr
auto
concat_tuple
(
const
Tuple
<
X
...
>&
tx
,
const
Tuple
<
Y
...
>&
ty
)
{
return
unpack2
(
[
&
](
auto
...
zs
)
{
return
Tuple
<
decltype
(
zs
)...
>
{
std
::
forward
<
decltype
(
zs
)
>
(
zs
)...};
},
tx
,
ty
);
}
// Support any number of tuples to concat (also 1)
template
<
typename
...
X
>
__host__
__device__
constexpr
auto
concat_tuple
(
const
Tuple
<
X
...
>&
tx
)
{
return
tx
;
}
template
<
typename
...
X
,
typename
...
Tuples
>
__host__
__device__
constexpr
auto
concat_tuple
(
const
Tuple
<
X
...
>&
tx
,
const
Tuples
&
...
tuples
)
{
return
concat_tuple
(
tx
,
concat_tuple
(
tuples
...));
}
namespace
detail
{
template
<
typename
F
,
typename
X
,
index_t
...
Is
>
...
...
@@ -78,4 +101,69 @@ __host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y,
f
,
x
,
y
,
z
,
typename
arithmetic_sequence_gen
<
0
,
X
::
Size
(),
1
>::
type
{});
}
// By default unroll to the flatten
template
<
index_t
Depth
=
0
,
index_t
MaxDepth
=
-
1
>
__host__
__device__
constexpr
auto
UnrollNestedTuple
(
const
Tuple
<>&
element
)
{
return
element
;
}
template
<
index_t
Depth
=
0
,
index_t
MaxDepth
=
-
1
,
typename
T
>
__host__
__device__
constexpr
auto
UnrollNestedTuple
(
const
T
&
element
)
{
return
make_tuple
(
element
);
}
template
<
index_t
Depth
=
0
,
index_t
MaxDepth
=
-
1
,
typename
...
Ts
>
__host__
__device__
constexpr
auto
UnrollNestedTuple
(
const
Tuple
<
Ts
...
>&
tuple
)
{
if
constexpr
(
Depth
==
MaxDepth
)
{
return
tuple
;
}
else
{
return
unpack
(
[
&
](
auto
&&
...
ts
)
{
return
concat_tuple
(
UnrollNestedTuple
<
Depth
+
1
,
MaxDepth
>
(
ts
)...);
},
tuple
);
}
}
template
<
typename
...
Ts
>
__host__
__device__
constexpr
auto
TupleReverse
(
const
Tuple
<
Ts
...
>&
tuple
)
{
return
generate_tuple
(
[
&
](
auto
i
)
{
using
Idx
=
Number
<
Tuple
<
Ts
...
>::
Size
()
-
i
-
1
>
;
return
tuple
.
At
(
Idx
{});
},
Number
<
Tuple
<
Ts
...
>::
Size
()
>
{});
}
// Reduce tuple values in specific range using Function
template
<
index_t
Idx
,
index_t
End
,
typename
F
,
typename
...
Ts
>
__host__
__device__
constexpr
auto
TupleReduce
(
F
&&
f
,
const
Tuple
<
Ts
...
>&
tuple
)
{
static_assert
(
Idx
<
End
,
"Wrong parameters for TupleReduce"
);
if
constexpr
(
Idx
+
1
==
End
)
{
return
tuple
.
At
(
Number
<
Idx
>
{});
}
else
{
return
f
(
tuple
.
At
(
Number
<
Idx
>
{}),
TupleReduce
<
Idx
+
1
,
End
>
(
f
,
tuple
));
}
}
template
<
typename
T
>
using
is_tuple
=
decltype
(
std
::
declval
<
T
&>
().
IsTuple
());
template
<
typename
...
Ts
>
__host__
__device__
constexpr
auto
IsNestedTuple
(
const
Tuple
<
Ts
...
>&
)
{
return
(
is_detected
<
is_tuple
,
Ts
>::
value
||
...);
}
}
// namespace ck
library/src/tensor_operation_instance/gpu/CMakeLists.txt
View file @
20ea6c75
...
...
@@ -58,7 +58,12 @@ endfunction(add_instance_library INSTANCE_NAME)
file
(
GLOB dir_list LIST_DIRECTORIES true *
)
set
(
CK_DEVICE_INSTANCES
)
set
(
CK_DEVICE_OTHER_INSTANCES
)
set
(
CK_DEVICE_GEMM_INSTANCES
)
set
(
CK_DEVICE_CONV_INSTANCES
)
set
(
CK_DEVICE_MHA_INSTANCES
)
set
(
CK_DEVICE_CONTRACTION_INSTANCES
)
set
(
CK_DEVICE_REDUCTION_INSTANCES
)
FOREACH
(
subdir_path
${
dir_list
}
)
set
(
target_dir
)
IF
(
IS_DIRECTORY
"
${
subdir_path
}
"
)
...
...
@@ -122,7 +127,19 @@ FOREACH(subdir_path ${dir_list})
if
((
add_inst EQUAL 1
))
get_filename_component
(
target_dir
${
subdir_path
}
NAME
)
add_subdirectory
(
${
target_dir
}
)
list
(
APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_
${
target_dir
}
_instance>
)
if
(
"
${
cmake_instance
}
"
MATCHES
"gemm"
)
list
(
APPEND CK_DEVICE_GEMM_INSTANCES $<TARGET_OBJECTS:device_
${
target_dir
}
_instance>
)
elseif
(
"
${
cmake_instance
}
"
MATCHES
"conv"
)
list
(
APPEND CK_DEVICE_CONV_INSTANCES $<TARGET_OBJECTS:device_
${
target_dir
}
_instance>
)
elseif
(
"
${
cmake_instance
}
"
MATCHES
"mha"
)
list
(
APPEND CK_DEVICE_MHA_INSTANCES $<TARGET_OBJECTS:device_
${
target_dir
}
_instance>
)
elseif
(
"
${
cmake_instance
}
"
MATCHES
"contr"
)
list
(
APPEND CK_DEVICE_CONTRACTION_INSTANCES $<TARGET_OBJECTS:device_
${
target_dir
}
_instance>
)
elseif
(
"
${
cmake_instance
}
"
MATCHES
"reduce"
)
list
(
APPEND CK_DEVICE_REDUCTION_INSTANCES $<TARGET_OBJECTS:device_
${
target_dir
}
_instance>
)
else
()
list
(
APPEND CK_DEVICE_OTHER_INSTANCES $<TARGET_OBJECTS:device_
${
target_dir
}
_instance>
)
endif
()
message
(
"add_instance_directory
${
subdir_path
}
"
)
else
()
message
(
"skip_instance_directory
${
subdir_path
}
"
)
...
...
@@ -130,18 +147,14 @@ FOREACH(subdir_path ${dir_list})
ENDIF
()
ENDFOREACH
()
add_library
(
device_operations STATIC
${
CK_DEVICE_INSTANCES
}
)
add_library
(
composablekernels::device_operations ALIAS device_operations
)
set
(
DEV_OPS_INC_DIRS
${
PROJECT_SOURCE_DIR
}
/include/ck/
${
PROJECT_SOURCE_DIR
}
/library/include/ck/
)
target_compile_features
(
device_operations PUBLIC
)
set_target_properties
(
device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
device_operations PUBLIC
if
(
CK_DEVICE_OTHER_INSTANCES
)
add_library
(
device_other_operations STATIC
${
CK_DEVICE_OTHER_INSTANCES
}
)
add_library
(
composablekernels::device_other_operations ALIAS device_other_operations
)
target_compile_features
(
device_other_operations PUBLIC
)
set_target_properties
(
device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
device_other_operations PUBLIC
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/utility>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/tensor_description>
...
...
@@ -157,23 +170,115 @@ target_include_directories(device_operations PUBLIC
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/utility>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/quantization>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/softmax>
)
rocm_install
(
TARGETS device_other_operations
EXPORT device_other_operationsTargets
)
rocm_install
(
EXPORT device_other_operationsTargets
FILE composable_kerneldevice_other_operationsTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
endif
()
if
(
CK_DEVICE_GEMM_INSTANCES
)
add_library
(
device_gemm_operations STATIC
${
CK_DEVICE_GEMM_INSTANCES
}
)
add_library
(
composablekernels::device_gemm_operations ALIAS device_gemm_operations
)
target_compile_features
(
device_gemm_operations PUBLIC
)
set_target_properties
(
device_gemm_operations PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
device_gemm_operations PUBLIC
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu>
)
rocm_install
(
TARGETS device_gemm_operations
EXPORT device_gemm_operationsTargets
)
rocm_install
(
EXPORT device_gemm_operationsTargets
FILE composable_kerneldevice_gemm_operationsTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
endif
()
if
(
CK_DEVICE_CONV_INSTANCES
)
add_library
(
device_conv_operations STATIC
${
CK_DEVICE_CONV_INSTANCES
}
)
add_library
(
composablekernels::device_conv_operations ALIAS device_conv_operations
)
target_compile_features
(
device_conv_operations PUBLIC
)
set_target_properties
(
device_conv_operations PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
device_conv_operations PUBLIC
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd>
)
rocm_install
(
TARGETS device_conv_operations
EXPORT device_conv_operationsTargets
)
rocm_install
(
EXPORT device_conv_operationsTargets
FILE composable_kerneldevice_conv_operationsTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
endif
()
if
(
CK_DEVICE_MHA_INSTANCES
)
add_library
(
device_mha_operations STATIC
${
CK_DEVICE_MHA_INSTANCES
}
)
add_library
(
composablekernels::device_mha_operations ALIAS device_mha_operations
)
target_compile_features
(
device_mha_operations PUBLIC
)
set_target_properties
(
device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
device_mha_operations PUBLIC
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/mha>
)
rocm_install
(
TARGETS device_mha_operations
EXPORT device_mha_operationsTargets
)
rocm_install
(
EXPORT device_mha_operationsTargets
FILE composable_kerneldevice_mha_operationsTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
endif
()
if
(
CK_DEVICE_CONTRACTION_INSTANCES
)
add_library
(
device_contraction_operations STATIC
${
CK_DEVICE_CONTRACTION_INSTANCES
}
)
add_library
(
composablekernels::device_contraction_operations ALIAS device_contraction_operations
)
target_compile_features
(
device_contraction_operations PUBLIC
)
set_target_properties
(
device_contraction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
device_contraction_operations PUBLIC
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu>
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/contraction>
)
rocm_install
(
TARGETS device_contraction_operations
EXPORT device_contraction_operationsTargets
)
rocm_install
(
EXPORT device_contraction_operationsTargets
FILE composable_kerneldevice_contraction_operationsTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
endif
()
if
(
CK_DEVICE_REDUCTION_INSTANCES
)
add_library
(
device_reduction_operations STATIC
${
CK_DEVICE_REDUCTION_INSTANCES
}
)
add_library
(
composablekernels::device_reduction_operations ALIAS device_reduction_operations
)
target_compile_features
(
device_reduction_operations PUBLIC
)
set_target_properties
(
device_reduction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
device_reduction_operations PUBLIC
$<INSTALL_INTERFACE:
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck/library/tensor_operation_instance/gpu/reduce>
)
#once new arches are enabled make this an option on the main cmake file
# and pass down here to be exported
target_compile_options
(
device_operations PRIVATE
--offload-arch=gfx908
--offload-arch=gfx90a
)
# install(TARGETS device_operations LIBRARY DESTINATION lib)
rocm_install
(
TARGETS device_operations
EXPORT device_operationsTargets
)
rocm_install
(
DIRECTORY
${
DEV_OPS_INC_DIRS
}
DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck
)
rocm_install
(
EXPORT device_operationsTargets
FILE composable_kerneldevice_operationsTargets.cmake
)
rocm_install
(
TARGETS device_reduction_operations
EXPORT device_reduction_operationsTargets
)
rocm_install
(
EXPORT device_reduction_operationsTargets
FILE composable_kerneldevice_reduction_operationsTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
endif
()
add_library
(
device_operations INTERFACE
)
target_link_libraries
(
device_operations INTERFACE
device_contraction_operations
device_conv_operations
device_gemm_operations
device_other_operations
device_reduction_operations
utility
)
set
(
DEV_OPS_INC_DIRS
${
PROJECT_SOURCE_DIR
}
/include/ck/
${
PROJECT_SOURCE_DIR
}
/library/include/ck/
)
rocm_install
(
DIRECTORY
${
DEV_OPS_INC_DIRS
}
DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck
)
profiler/include/profiler/profile_gemm_impl.hpp
View file @
20ea6c75
...
...
@@ -166,7 +166,7 @@ int profile_gemm_impl(int do_verification,
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
,
0
,
10
,
50
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
...
...
profiler/src/profile_transpose.cpp
0 → 100644
View file @
20ea6c75
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_transpose_impl.hpp"
#include "profiler_operation_registry.hpp"
enum
struct
MatrixLayout
{
NCDHW
,
// 0
NCHWD
,
// 1
};
enum
struct
DataType
{
F32_F32_F32_F32_F32
,
// 0
F16_F16_F16_F16_F16
,
// 1
};
#define OP_NAME "transpose"
#define OP_DESC "Transpose"
int
profile_transpose
(
int
argc
,
char
*
argv
[])
{
if
(
argc
!=
15
)
{
printf
(
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
);
printf
(
"arg2: data type (0: fp32; 1: fp16)
\n
"
);
// printf("arg3: matrix layout (NCDHW -> NDCHW);\n");
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg6: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7: time kernel (0=no, 1=yes)
\n
"
);
printf
(
"arg8 to 13: N, C, D, H, W
\n
"
);
exit
(
1
);
}
const
auto
data_type
=
static_cast
<
DataType
>
(
std
::
stoi
(
argv
[
2
]));
// const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
3
]);
const
int
init_method
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
5
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
6
]);
std
::
vector
<
index_t
>
lengths
=
std
::
stoi
(
argv
[
7
]);
/**const int N = std::stoi(argv[7]);
const int C = std::stoi(argv[8]);
const int D = std::stoi(argv[9]);
const int H = std::stoi(argv[10]);
const int W = std::stoi(argv[11]);**/
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
bool
pass
=
ck
::
profiler
::
profile_transpose_impl
<
ADataType
,
BDataType
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
lengths
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
GemmDataType
::
F32_F32_F32_F32_F32
)
{
return
profile
(
F32
{},
F32
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16_F16_F16
)
{
return
profile
(
F16
{},
F16
{});
}
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
}
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
profile_gemm_transpose
);
test/transpose/test_transpose_ut_cases.inc
View file @
20ea6c75
...
...
@@ -14,7 +14,6 @@ TYPED_TEST(TestTranspose, Test1)
this
->
Run
();
}
TYPED_TEST
(
TestTranpose
,
Test2
)
{
std
::
vector
<
int
>
Ms
{
127
,
255
,
312
,
799
,
1573
};
...
...
@@ -27,4 +26,3 @@ TYPED_TEST(TestTranpose, Test2)
this
->
Run
();
}
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment