gaoqiong / composable_kernel
"git@developer.sourcefind.cn:gaoqiong/yaml-cpp.git" did not exist on "d0da14404ef7cc960ef01bbf97775ead08d3c477"
Unverified commit e573a2a0, authored Jun 30, 2022 by Chao Liu, committed by GitHub on Jun 30, 2022.

Merge branch 'develop' into batched_gemm_g_stride_fix

Parents: 6adf3591, 0dcb3496
Changes: 258 files in this merge; this page shows 20 changed files with 454 additions and 105 deletions (+454, -105).
Files on this page:

- README.md  (+7, -0)
- client_example/01_gemm/CMakeLists.txt  (+2, -0)
- client_example/01_gemm/gemm.cpp  (+218, -0)
- client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp  (+33, -29)
- client_example/03_gemm_layernorm/CMakeLists.txt  (+2, -2)
- client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp  (+10, -9)
- client_example/CMakeLists.txt  (+1, -0)
- client_example/README.md  (+1, -12)
- include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp  (+23, -4)
- include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp  (+10, -3)
- include/ck/tensor_operation/gpu/device/device_gemm.hpp  (+36, -17)
- include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp  (+11, -4)
- include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp  (+25, -5)
- include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp  (+10, -4)
- include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp  (+3, -0)
- include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp  (+24, -4)
- include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp  (+10, -4)
- include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp  (+10, -4)
- include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp  (+9, -2)
- include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp  (+9, -2)
README.md  [view file @ e573a2a0]

@@ -26,6 +26,7 @@ cmake \
 -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
+-D CMAKE_INSTALL_PREFIX=${PATH_TO_CK_INSTALL_DIRECTORY} \
 ..
 ```

@@ -47,6 +48,12 @@ Instructions for running each individual examples are under ```example/```
 ```
 Instructions for running ckProfiler are under ```profiler/```
 
+## Install CK
+```bash
+make install
+```
+
+## Using CK as pre-built kernel library
 
 ## Caveat
 ### Kernel Timing and Verification
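The two README hunks together describe the new install flow. A minimal sketch of the resulting configure-and-install sequence, assuming a build directory inside the CK source tree and a writable ${PATH_TO_CK_INSTALL_DIRECTORY}:

```bash
mkdir -p build && cd build

cmake \
  -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3" \
  -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
  -D CMAKE_PREFIX_PATH=/opt/rocm \
  -D CMAKE_INSTALL_PREFIX=${PATH_TO_CK_INSTALL_DIRECTORY} \
  ..

# build, then install headers, libraries and the composable_kernel
# CMake package files into the chosen prefix
make install
```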
client_example/01_gemm/CMakeLists.txt  [new file, mode 100644; view file @ e573a2a0]

add_executable(client_gemm gemm.cpp)
target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations)
client_example/01_gemm/gemm.cpp  [new file, mode 100644; view file @ e573a2a0]

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iomanip>
#include <vector>
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"

using F16 = ck::half_t;
using F32 = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;

using ADataType = F16;
using BDataType = F16;
using CDataType = F16;

using ALayout = Row;
using BLayout = Col;
using CLayout = Row;

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int main(int argc, char* argv[])
{
    // GEMM shape
    ck::index_t M = 3840;
    ck::index_t N = 4096;
    ck::index_t K = 4096;

    ck::index_t StrideA = 4096;
    ck::index_t StrideB = 4096;
    ck::index_t StrideC = 4096;

    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 7)
    {
        M = std::stoi(argv[1]);
        N = std::stoi(argv[2]);
        K = std::stoi(argv[3]);

        StrideA = std::stoi(argv[4]);
        StrideB = std::stoi(argv[5]);
        StrideC = std::stoi(argv[6]);
    }
    else
    {
        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
        exit(0);
    }

    auto f_matrix_space_size =
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);

            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }
            else
            {
                return (nCol - 1) * stride + nRow;
            }
        };

    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
    SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));

    using DeviceOp = ck::tensor_operation::device::DeviceGemm<
        ALayout,
        BLayout,
        CLayout,
        ADataType,
        BDataType,
        CDataType,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough,
        ck::tensor_operation::element_wise::PassThrough>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                                        b_device_buf.GetDeviceBuffer(),
                                                        c_device_buf.GetDeviceBuffer(),
                                                        M,
                                                        N,
                                                        K,
                                                        StrideA,
                                                        StrideB,
                                                        StrideC,
                                                        a_element_op,
                                                        b_element_op,
                                                        c_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop = std::size_t(2) * M * N * K;

            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];

        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                                        b_device_buf.GetDeviceBuffer(),
                                                        c_device_buf.GetDeviceBuffer(),
                                                        M,
                                                        N,
                                                        K,
                                                        StrideA,
                                                        StrideB,
                                                        StrideC,
                                                        a_element_op,
                                                        b_element_op,
                                                        c_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
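For reference, a rough sketch of how the new client example is invoked once the client_gemm target from the CMakeLists.txt above has been built (the binary path is an assumption and depends on the build directory layout):

```bash
# default problem: M=3840, N=4096, K=4096, all strides 4096
./client_gemm

# explicit problem: M N K StrideA StrideB StrideC
./client_gemm 3840 4096 4096 4096 4096 4096
```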
client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp  [view file @ e573a2a0]

@@ -10,7 +10,7 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
-#include "ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
 
 using F16 = ck::half_t;
 using F32 = float;

@@ -25,18 +25,17 @@ using AElementOp = PassThrough;
 using BElementOp   = PassThrough;
 using CDEElementOp = AddAddFastGelu;
 
 using ADataType  = F16;
 using BDataType  = F16;
-using AccDataType = F32;
 using D0DataType = F16;
 using D1DataType = F16;
 using EDataType  = F16;
 
 using ALayout  = Row;
 using BLayout  = Col;
-using D0Layout = Row;
-using D1Layout = Row;
-using ELayout  = Row;
+using DDELayout = Row;
+using DELayout  = Row;
 
 struct SimpleDeviceMem
 {

@@ -106,24 +105,27 @@ int main(int argc, char* argv[])
     SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
     SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
     SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
-                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+                                      f_matrix_space_size(M, N, StrideD0, DDELayout{}));
     SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
-                                      f_matrix_space_size(M, N, StrideD1, D1Layout{}));
-    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+                                      f_matrix_space_size(M, N, StrideD1, DDELayout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, DELayout{}));
 
-    // add device op instances
-    const auto op_ptrs = ck::tensor_operation::device::device_gemm_instance::
-        get_device_gemm_add_add_fastgelu_instances<ADataType,
-                                                   BDataType,
-                                                   AccDataType,
-                                                   D0DataType,
-                                                   D1DataType,
-                                                   EDataType,
-                                                   ALayout,
-                                                   BLayout,
-                                                   D0Layout,
-                                                   D1Layout,
-                                                   ELayout>();
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        DDELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddAddFastGelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
 
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

@@ -231,6 +233,8 @@ int main(int argc, char* argv[])
         {
             invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }
+
+        std::cout << "Done" << std::endl;
     }
 
     return 0;
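This is the same migration as in the new 01_gemm example: rather than calling an op-specific getter such as get_device_gemm_add_add_fastgelu_instances<...>(), the client spells out the device-op interface type and asks the generic factory for every registered instance. A condensed, hedged sketch of the new lookup pattern, using only header names and element-wise ops that appear in this diff:

```cpp
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"

using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;

// The interface type carries layouts, data types (Ds as a Tuple) and element-wise ops.
using DeviceOp =
    ck::tensor_operation::device::DeviceGemmMultipleD<Row, Col, Row,
                                                      F16, F16, ck::Tuple<F16, F16>, F16,
                                                      PassThrough, PassThrough, AddAddFastGelu>;

int main()
{
    // The factory enumerates every kernel instance registered for this interface type.
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // From here each instance is used exactly as in 01_gemm: check IsSupportedArgument(),
    // then MakeArgumentPointer()/MakeInvokerPointer() and Run().
    return 0;
}
```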
client_example/03_gemm_layernorm/CMakeLists.txt  [view file @ e573a2a0]

-add_executable(gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp)
-target_link_libraries(gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp)
+target_link_libraries(client_gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations)
View file @
e573a2a0
...
@@ -160,16 +160,17 @@ int main()
...
@@ -160,16 +160,17 @@ int main()
ck
::
index_t
StrideC
=
1024
;
ck
::
index_t
StrideC
=
1024
;
ck
::
index_t
StrideD0
=
1024
;
ck
::
index_t
StrideD0
=
1024
;
const
auto
gemm_reduce_ptrs
=
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
const
auto
gemm_reduce_ptrs
=
get_device_gemm_add_add_mean_squaremean_instances
<
ADataType
,
ck
::
tensor_operation
::
device
::
instance
::
get_device_gemm_add_add_mean_squaremean_instances
<
BDataType
,
ADataType
,
CDataType
,
BDataType
,
ALayout
,
CDataType
,
BLayout
,
ALayout
,
CLayout
>
();
BLayout
,
CLayout
>
();
const
auto
normalize_ptrs
=
const
auto
normalize_ptrs
=
ck
::
tensor_operation
::
device
::
get_device_normalize_from_mean_meansquare_instances
<
ck
::
tensor_operation
::
device
::
instance
::
get_device_normalize_from_mean_meansquare_instances
<
CDataType
,
CDataType
,
ReduceDataType
,
ReduceDataType
,
ReduceDataType
,
ReduceDataType
,
...
@@ -267,4 +268,4 @@ int main()
...
@@ -267,4 +268,4 @@ int main()
<<
std
::
endl
;
<<
std
::
endl
;
}
}
}
}
}
}
\ No newline at end of file
client_example/CMakeLists.txt  [view file @ e573a2a0]

@@ -6,5 +6,6 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
 find_package(hip REQUIRED PATHS /opt/rocm)
 message(STATUS "Build with HIP ${hip_VERSION}")
 
+add_subdirectory(01_gemm)
 add_subdirectory(02_gemm_add_add_fastgelu)
 add_subdirectory(03_gemm_layernorm)
client_example/README.md  [view file @ e573a2a0]

 ##
 Client application links to CK library, and therefore CK library needs to be installed before building client applications.
 
-## Docker script
-```bash
-docker run                                     \
--it                                            \
---privileged                                   \
---group-add sudo                               \
--w /root/workspace                             \
--v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
-rocm/tensorflow:rocm5.1-tf2.6-dev              \
-/bin/bash
-```
-
 ## Build
 ```bash

@@ -22,7 +11,7 @@ cd client_example/build
 ```bash
 cmake                                                              \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                          \
--D CMAKE_PREFIX_PATH=/opt/rocm                                     \
+-D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}"   \
 ..
 ```
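Combined with the top-level README change, the intended client workflow after this commit is roughly the following (a sketch; it assumes CK was installed to ${PATH_TO_CK_INSTALL_DIRECTORY} as above and that the commands are run from the repository root):

```bash
# configure the client examples against the installed composable_kernel package
mkdir -p client_example/build
cd client_example/build

cmake                                                              \
  -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                        \
  -D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}" \
  ..

# builds the example executables added under client_example/ (e.g. client_gemm)
make
```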
include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp  [view file @ e573a2a0]

@@ -12,7 +12,13 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
-template <typename AElementwiseOperation,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
 struct DeviceBatchedGemm : public BaseOperator

@@ -37,11 +43,24 @@ struct DeviceBatchedGemm : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
-template <typename AElementwiseOperation,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
-using DeviceBatchedGemmPtr = std::unique_ptr<
-    DeviceBatchedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
+using DeviceBatchedGemmPtr = std::unique_ptr<DeviceBatchedGemm<ALayout,
+                                                               BLayout,
+                                                               CLayout,
+                                                               ADataType,
+                                                               BDataType,
+                                                               CDataType,
+                                                               AElementwiseOperation,
+                                                               BElementwiseOperation,
+                                                               CElementwiseOperation>>;
 
 } // namespace device
 } // namespace tensor_operation
include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp  [view file @ e573a2a0]

@@ -113,7 +113,7 @@ __global__ void
     ignore = c_element_op;
     ignore = compute_ptr_offset_of_batch;
     ignore = block_2_ctile_map;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif
 }
 
 template <typename ADataType,

@@ -151,8 +151,15 @@ template <typename ADataType,
           bool BBlockLdsAddExtraN,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector>
-struct DeviceBatchedGemmXdl
-    : public DeviceBatchedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+struct DeviceBatchedGemmXdl : public DeviceBatchedGemm<ALayout,
+                                                       BLayout,
+                                                       CLayout,
+                                                       ADataType,
+                                                       BDataType,
+                                                       CDataType,
+                                                       AElementwiseOperation,
+                                                       BElementwiseOperation,
+                                                       CElementwiseOperation>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
include/ck/tensor_operation/gpu/device/device_gemm.hpp  [view file @ e573a2a0]

@@ -17,33 +17,52 @@ struct GemmShape
     ck::index_t StrideA, StrideB, StrideC;
 };
 
-template <typename AElementwiseOperation,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
 struct DeviceGemm : public BaseOperator
 {
-    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
-                                                              const void* p_b,
-                                                              void* p_c,
-                                                              ck::index_t M,
-                                                              ck::index_t N,
-                                                              ck::index_t K,
-                                                              ck::index_t StrideA,
-                                                              ck::index_t StrideB,
-                                                              ck::index_t StrideC,
-                                                              AElementwiseOperation a_element_op,
-                                                              BElementwiseOperation b_element_op,
-                                                              CElementwiseOperation c_element_op,
-                                                              ck::index_t KBatch = 1) = 0;
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        ck::index_t StrideC,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
-template <typename AElementwiseOperation,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
-using DeviceGemmPtr = std::unique_ptr<
-    DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
+using DeviceGemmPtr = std::unique_ptr<DeviceGemm<ALayout,
+                                                 BLayout,
+                                                 CLayout,
+                                                 ADataType,
+                                                 BDataType,
+                                                 CDataType,
+                                                 AElementwiseOperation,
+                                                 BElementwiseOperation,
+                                                 CElementwiseOperation>>;
 
 template <typename AElementwiseOperation,
           typename BElementwiseOperation,
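The practical effect is that the problem description (layouts and data types) moves from the instance-getter call into the DeviceGemm interface type itself, which is what lets DeviceOperationInstanceFactory in the client examples key on a single type. A hedged sketch of how a caller's type spelling changes, reusing the aliases of the 01_gemm example and assuming the same ck headers are included:

```cpp
using F16         = ck::half_t;
using Row         = ck::tensor_layout::gemm::RowMajor;
using Col         = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Before this commit (old header, shown as a comment for comparison only):
// the smart-pointer alias was parameterized by the element-wise operations alone.
//
//   using GemmPtr = ck::tensor_operation::device::
//       DeviceGemmPtr<PassThrough, PassThrough, PassThrough>;

// After this commit: layouts and data types are part of the interface type, and
// MakeArgumentPointer() no longer takes the trailing `ck::index_t KBatch = 1`
// (split-K problems are presumably served by the separate DeviceGemmSplitK
// interface, which this commit also updates).
using GemmPtr = ck::tensor_operation::device::DeviceGemmPtr<Row, Col, Row,
                                                            F16, F16, F16,
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>;
```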
include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp  [view file @ e573a2a0]

@@ -64,8 +64,16 @@ template <
           is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
           is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
       bool> = false>
 struct DeviceGemmDl
-    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+    : public DeviceGemm<ALayout,
+                        BLayout,
+                        CLayout,
+                        ADataType,
+                        BDataType,
+                        CDataType,
+                        AElementwiseOperation,
+                        BElementwiseOperation,
+                        CElementwiseOperation>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

@@ -534,8 +542,7 @@ struct DeviceGemmDl
                       index_t StrideC,
                       AElementwiseOperation a_element_op,
                       BElementwiseOperation b_element_op,
-                      CElementwiseOperation c_element_op,
-                      index_t /* KBatch */ = 1) override
+                      CElementwiseOperation c_element_op) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp  [view file @ e573a2a0]

@@ -16,12 +16,20 @@ namespace device {
 // output : E[M, N]
 // C = a_op(A) * b_op(B)
 // E = cde_op(C, D0, D1, ...)
-template <ck::index_t NumDTensor,
+template <typename ALayout,
+          typename BLayout,
+          typename DELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CDEElementwiseOperation>
 struct DeviceGemmMultipleD : public BaseOperator
 {
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_a,
                         const void* p_b,

@@ -41,14 +49,26 @@ struct DeviceGemmMultipleD : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
-template <ck::index_t NumDTensor,
+template <typename ALayout,
+          typename BLayout,
+          typename DELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
-          typename CElementwiseOperation>
-using DeviceGemmMultipleDPtr = std::unique_ptr<DeviceGemmMultipleD<NumDTensor,
+          typename CDEElementwiseOperation>
+using DeviceGemmMultipleDPtr = std::unique_ptr<DeviceGemmMultipleD<ALayout,
+                                                                   BLayout,
+                                                                   DELayout,
+                                                                   ADataType,
+                                                                   BDataType,
+                                                                   DsDataType,
+                                                                   EDataType,
                                                                    AElementwiseOperation,
                                                                    BElementwiseOperation,
-                                                                   CElementwiseOperation>>;
+                                                                   CDEElementwiseOperation>>;
 
 } // namespace device
 } // namespace tensor_operation
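A small illustrative snippet of the NumDTensor change (a sketch; the F16 data types mirror the gemm_add_add_fastgelu example above): the number of auxiliary D tensors is now derived from the DsDataType tuple rather than being passed as a separate non-type template parameter.

```cpp
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"

using F16 = ck::half_t;

// Two D tensors: the two "add" inputs of GEMM + add + add + fastgelu.
using DsDataType = ck::Tuple<F16, F16>;

// DeviceGemmMultipleD now computes this internally:
//   static constexpr index_t NumDTensor = DsDataType::Size();
static_assert(DsDataType::Size() == 2, "two D tensors expected");
```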
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp  [view file @ e573a2a0]

@@ -96,7 +96,7 @@ namespace device {
 // E = cde_op(C, D0, D1, ...)
 template <typename ALayout,
           typename BLayout,
-          typename CDELayout,
+          typename DELayout,
           typename ADataType,
           typename BDataType,
           typename GemmAccDataType,

@@ -137,7 +137,13 @@ template <typename ALayout,
           typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CDEBlockTransferScalarPerVector_NPerBlock,
           LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<DsDataType::Size(),
+struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
+                                                                     BLayout,
+                                                                     DELayout,
+                                                                     ADataType,
+                                                                     BDataType,
+                                                                     DsDataType,
+                                                                     EDataType,
                                                                      AElementwiseOperation,
                                                                      BElementwiseOperation,
                                                                      CDEElementwiseOperation>

@@ -360,12 +366,12 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<DsDataType:
     static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
     {
         const auto c_grid_desc_mraw_nraw = [&]() {
-            if constexpr(is_same<tensor_layout::gemm::RowMajor, CDELayout>::value)
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, DELayout>::value)
             {
                 return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
                                                     make_tuple(StrideE, I1));
             }
-            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CDELayout>::value)
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, DELayout>::value)
             {
                 return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
                                                     make_tuple(I1, StrideE));
include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp  [view file @ e573a2a0]

@@ -2,13 +2,16 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
+
 #include <iostream>
+
 #include "device_base.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
+// FIXME: DeviceGemmReduce type need to well define the problem
 template <ck::index_t NumDTensor, ck::index_t NumReduce>
 struct DeviceGemmReduce : public BaseOperator
 {
include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp  [view file @ e573a2a0]

@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
+
 #include <iostream>
 #include <vector>

@@ -11,7 +12,13 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
-template <typename AElementwiseOperation,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
 struct DeviceGemmSplitK : public BaseOperator

@@ -33,11 +40,24 @@ struct DeviceGemmSplitK : public BaseOperator
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
 
-template <typename AElementwiseOperation,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
-using DeviceGemmSplitKPtr = std::unique_ptr<
-    DeviceGemmSplitK<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
+using DeviceGemmSplitKPtr = std::unique_ptr<DeviceGemmSplitK<ALayout,
+                                                             BLayout,
+                                                             CLayout,
+                                                             ADataType,
+                                                             BDataType,
+                                                             CDataType,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>>;
 
 } // namespace device
 } // namespace tensor_operation
include/ck/tensor_operation/gpu/device/device_gemm_xdl.hpp  [view file @ e573a2a0]

@@ -57,8 +57,15 @@ template <typename ADataType,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector,
           ck::index_t NumPrefetch = 1>
-struct DeviceGemmXdl
-    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+struct DeviceGemmXdl : public DeviceGemm<ALayout,
+                                         BLayout,
+                                         CLayout,
+                                         ADataType,
+                                         BDataType,
+                                         CDataType,
+                                         AElementwiseOperation,
+                                         BElementwiseOperation,
+                                         CElementwiseOperation>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};

@@ -487,8 +494,7 @@ struct DeviceGemmXdl
                       index_t StrideC,
                       AElementwiseOperation a_element_op,
                       BElementwiseOperation b_element_op,
-                      CElementwiseOperation c_element_op,
-                      index_t /* KBatch */ = 1) override
+                      CElementwiseOperation c_element_op) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
include/ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp  [view file @ e573a2a0]

@@ -65,8 +65,15 @@ template <typename ALayout,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
           LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceGemm_Xdl_CShuffle
-    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+struct DeviceGemm_Xdl_CShuffle : public DeviceGemm<ALayout,
+                                                   BLayout,
+                                                   CLayout,
+                                                   ADataType,
+                                                   BDataType,
+                                                   CDataType,
+                                                   AElementwiseOperation,
+                                                   BElementwiseOperation,
+                                                   CElementwiseOperation>
 {
     using DeviceOp = DeviceGemm_Xdl_CShuffle;

@@ -622,8 +629,7 @@ struct DeviceGemm_Xdl_CShuffle
                       index_t StrideC,
                       AElementwiseOperation a_element_op,
                       BElementwiseOperation b_element_op,
-                      CElementwiseOperation c_element_op,
-                      index_t /* KBatch */ = 1) override
+                      CElementwiseOperation c_element_op) override
     {
         return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                           static_cast<const BDataType*>(p_b),
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp  [view file @ e573a2a0]

@@ -56,8 +56,15 @@ template <typename ADataType,
           bool BBlockLdsAddExtraN,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector>
-struct DeviceGemmXdlSplitK
-    : public DeviceGemmSplitK<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+struct DeviceGemmXdlSplitK : public DeviceGemmSplitK<ALayout,
+                                                     BLayout,
+                                                     CLayout,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AElementwiseOperation,
+                                                     BElementwiseOperation,
+                                                     CElementwiseOperation>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp  [view file @ e573a2a0]

@@ -58,8 +58,15 @@ template <typename ADataType,
           index_t CShuffleNRepeatPerShuffle,
           typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CBlockTransferScalarPerVector_NWaveNPerXDL>
-struct DeviceGemmXdlSplitKCShuffle
-    : public DeviceGemmSplitK<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
+                                                             BLayout,
+                                                             CLayout,
+                                                             ADataType,
+                                                             BDataType,
+                                                             CDataType,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>
 {
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};