gaoqiong / composable_kernel_ROCM · Commits

Commit 32806d5f
Authored Dec 27, 2023 by Jun Liu

    Merge branch 'amd-develop' into amd-master

Parents: e70a4d19, d0f355a3

Changes: 138. Showing 20 changed files with 283 additions and 75 deletions (+283, -75).
Files in this page of the diff:

    client_example/05_layernorm/layernorm4d_fwd.cpp                                                      +2   -1
    client_example/06_softmax/softmax4d.cpp                                                              +1   -0
    client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp                              +1   -0
    client_example/15_gemm_add_multiply/gemm_add_multiply.cpp                                            +1   -0
    client_example/18_groupnorm/CMakeLists.txt                                                           +5   -2
    client_example/18_groupnorm/groupnorm_bwd_data.cpp                                                   +182 -0
    client_example/18_groupnorm/groupnorm_swish_fwd.cpp                                                  +0   -0
    client_example/19_pool/avg_pool3d_fwd.cpp                                                            +31  -32
    client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp                                                +1   -0
    client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp                                 +1   -0
    client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc  +10 -6
    client_example/25_tensor_transforms/CMakeLists.txt                                                   +4   -0
    client_example/25_tensor_transforms/tensor_transform.cpp                                             +0   -0
    client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp                               +13  -18
    cmake/ClangTidy.cmake                                                                                +1   -1
    dev-requirements.txt                                                                                 +2   -2
    docs/conf.py                                                                                         +19  -8
    docs/doxygen/Doxyfile                                                                                +4   -2
    docs/index.rst                                                                                       +2   -0
    docs/sphinx/_toc.yml.in                                                                              +3   -3
client_example/05_layernorm/layernorm4d_fwd.cpp

@@ -16,7 +16,7 @@ using XDataType = ck::half_t;
 using GammaDataType          = ck::half_t;
 using BetaDataType           = ck::half_t;
 using YDataType              = ck::half_t;
-using SaveMeanInvStdDataType = float;
+using SaveMeanInvStdDataType = ck::half_t;
 using PassThrough            = ck::tensor_operation::element_wise::PassThrough;

 #define SAVE_MEAN_INV_STD

@@ -155,6 +155,7 @@ int main(int argc, char* argv[])
               << best_op_name << std::endl;

     // run the best intance
+    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
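The if(found) guard is the recurring one-line fix in this merge: several client examples index op_ptrs[best_op_id] after the timing loop, and best_op_id stays at -1 when no instance supports the problem. A minimal self-contained sketch of the pattern (illustrative names, not code from the commit):

    // Sketch: the selection loop leaves best_op_id at -1 when nothing ran,
    // so the lookup after the loop must be guarded.
    #include <iostream>
    #include <limits>
    #include <vector>

    int main()
    {
        std::vector<float> times; // hypothetical per-instance timings; may be empty

        bool found          = false;
        int best_op_id      = -1;
        float best_ave_time = std::numeric_limits<float>::max();

        for(std::size_t i = 0; i < times.size(); ++i)
        {
            if(times[i] < best_ave_time)
            {
                found         = true;
                best_op_id    = static_cast<int>(i);
                best_ave_time = times[i];
            }
        }

        if(found) // without this guard, times[best_op_id] would index with -1
        {
            std::cout << "best instance " << best_op_id << ": " << best_ave_time << " ms\n";
        }
        return 0;
    }

The same guard is added, unchanged, in the softmax, elementwise-layernorm, gemm-add-multiply, split-K GEMM, and elementwise-transpose examples below.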
client_example/06_softmax/softmax4d.cpp

@@ -140,6 +140,7 @@ int main(int argc, char* argv[])
               << best_op_name << std::endl;

     // run the best intance
+    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp

@@ -142,6 +142,7 @@ int main()
               << best_op_name << std::endl;

     // run the best intance
+    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
client_example/15_gemm_add_multiply/gemm_add_multiply.cpp

@@ -204,6 +204,7 @@ int main(int argc, char* argv[])
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

     // run the best intance
+    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
client_example/18_groupnorm/CMakeLists.txt

-add_executable(client_groupnorm_swish groupnorm_swish.cpp)
-target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_other_operations)
+add_executable(client_groupnorm_bwd_data groupnorm_bwd_data.cpp)
+target_link_libraries(client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations)
+
+add_executable(client_groupnorm_swish_fwd groupnorm_swish_fwd.cpp)
+target_link_libraries(client_groupnorm_swish_fwd PRIVATE composable_kernel::device_other_operations)
client_example/18_groupnorm/groupnorm_bwd_data.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iomanip>
#include <vector>
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_data.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_data.hpp"

using DYDataType         = float;
using XDataType          = float;
using GammaDataType      = float;
using MeanInvStdDataType = float;
using DXDataType         = float;

constexpr int Rank         = 5;
constexpr int NumReduceDim = 3;

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int main(int argc, char* argv[])
{
    ck::index_t N = 32;
    ck::index_t H = 16;
    ck::index_t W = 16;
    ck::index_t G = 64;
    ck::index_t C = 128;

    std::size_t length = N * H * W * G * C;

    std::vector<ck::index_t> strideDy         = {H * W * G * C, W * G * C, G * C, C, 1};
    std::vector<ck::index_t> strideX          = strideDy;
    std::vector<ck::index_t> strideDx         = strideDy;
    std::vector<ck::index_t> strideGamma      = {0, 0, 0, C, 1};
    std::vector<ck::index_t> strideMeanInvStd = {G, 0, 0, 1, 0};

    SimpleDeviceMem dy_dev(sizeof(DYDataType) * length);
    SimpleDeviceMem x_dev(sizeof(XDataType) * length);
    SimpleDeviceMem gamma_dev(sizeof(GammaDataType) * G * C);
    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G);
    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G);
    SimpleDeviceMem dx_dev(sizeof(DXDataType) * length);

    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
                                                                              XDataType,
                                                                              GammaDataType,
                                                                              MeanInvStdDataType,
                                                                              DXDataType,
                                                                              Rank,
                                                                              NumReduceDim>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
                                                        strideDy,
                                                        strideX,
                                                        strideGamma,
                                                        strideMeanInvStd,
                                                        strideMeanInvStd,
                                                        strideDx,
                                                        {1, 2, 4}, // reduceDims
                                                        dy_dev.GetDeviceBuffer(),
                                                        x_dev.GetDeviceBuffer(),
                                                        gamma_dev.GetDeviceBuffer(),
                                                        mean_dev.GetDeviceBuffer(),
                                                        inv_std_dev.GetDeviceBuffer(),
                                                        dx_dev.GetDeviceBuffer());

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t num_byte = sizeof(DYDataType) * length + sizeof(XDataType) * length +
                                   sizeof(GammaDataType) * G * C +
                                   sizeof(MeanInvStdDataType) * N * G * 2 +
                                   sizeof(DXDataType) * length;

            float gb_per_sec = num_byte / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec
                      << " GB/s, " << op_name << std::endl;

            if(ave_time < best_ave_time)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    // run the best intance
    if(found)
    {
        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_op_name << std::endl;

        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
                                                        strideDy,
                                                        strideX,
                                                        strideGamma,
                                                        strideMeanInvStd,
                                                        strideMeanInvStd,
                                                        strideDx,
                                                        {1, 2, 4}, // reduceDims
                                                        dy_dev.GetDeviceBuffer(),
                                                        x_dev.GetDeviceBuffer(),
                                                        gamma_dev.GetDeviceBuffer(),
                                                        mean_dev.GetDeviceBuffer(),
                                                        inv_std_dev.GetDeviceBuffer(),
                                                        dx_dev.GetDeviceBuffer());

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
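The stride vectors in this new example encode broadcasting: strideGamma = {0, 0, 0, C, 1} gives the [G, C] gamma tensor a rank-5 view in which the N, H, and W dimensions have stride 0, and strideMeanInvStd = {G, 0, 0, 1, 0} does the same for the per-(N, G) statistics. A minimal sketch of how a zero stride collapses an index (offset5d is an illustrative helper, not part of the commit):

    // Sketch: with stride 0 on a dimension, every index along that dimension
    // maps to the same element, which is how a [G, C] tensor is read as
    // [N, H, W, G, C] without being materialized.
    #include <cstddef>
    #include <iostream>

    // hypothetical helper: linear offset of a 5-d index under explicit strides
    std::size_t offset5d(const int idx[5], const int stride[5])
    {
        std::size_t off = 0;
        for(int d = 0; d < 5; ++d)
            off += static_cast<std::size_t>(idx[d]) * stride[d];
        return off;
    }

    int main()
    {
        const int C              = 128;
        const int strideGamma[5] = {0, 0, 0, C, 1}; // same values as above

        const int a[5] = {0, 0, 0, 2, 5}; // (n=0, h=0, w=0, g=2, c=5)
        const int b[5] = {7, 3, 9, 2, 5}; // different n, h, w; same g, c

        // both coordinates land on gamma element g * C + c = 2 * 128 + 5 = 261
        std::cout << offset5d(a, strideGamma) << " == " << offset5d(b, strideGamma) << '\n';
        return 0;
    }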
client_example/18_groupnorm/groupnorm_swish.cpp → client_example/18_groupnorm/groupnorm_swish_fwd.cpp

File moved.
client_example/19_pool/avg_pool3d_fwd.cpp

@@ -94,7 +94,6 @@ int main(int argc, char* argv[])
     SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
     SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
-    SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);

     using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
                                                                  WindowRank,

@@ -123,22 +122,22 @@ int main(int argc, char* argv[])
     for(int i = 0; i < op_ptrs.size(); ++i)
     {
         auto& op_ptr = op_ptrs[i];
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
-            in_length,
-            window_spatial_lengths,
-            out_length,
-            in_tensor_stride,
-            out_tensor_stride,
-            out_tensor_stride,
-            window_strides,
-            window_dilations,
-            input_left_pads,
-            input_right_pads,
-            {2, 3, 4});
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        nullptr,
+                                        in_length,
+                                        window_spatial_lengths,
+                                        out_length,
+                                        in_tensor_stride,
+                                        out_tensor_stride,
+                                        out_tensor_stride,
+                                        window_strides,
+                                        window_dilations,
+                                        input_left_pads,
+                                        input_right_pads,
+                                        {2, 3, 4});

         auto invoker_ptr = op_ptr->MakeInvokerPointer();

@@ -184,21 +183,21 @@ int main(int argc, char* argv[])
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                   << std::endl;

-        auto argument_ptr = op_ptr->MakeArgumentPointer(
-            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
-            in_length,
-            window_spatial_lengths,
-            out_length,
-            in_tensor_stride,
-            out_tensor_stride,
-            out_tensor_stride,
-            window_strides,
-            window_dilations,
-            input_left_pads,
-            input_right_pads,
-            {2, 3, 4});
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        nullptr,
+                                        in_length,
+                                        window_spatial_lengths,
+                                        out_length,
+                                        in_tensor_stride,
+                                        out_tensor_stride,
+                                        out_tensor_stride,
+                                        window_strides,
+                                        window_dilations,
+                                        input_left_pads,
+                                        input_right_pads,
+                                        {2, 3, 4});

         auto invoker_ptr = op_ptr->MakeInvokerPointer();
client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp

@@ -191,6 +191,7 @@ int main(int argc, char* argv[])
               << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

     // run the best intance
+    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp

@@ -117,6 +117,7 @@ int main()
               << best_op_name << std::endl;

     // run the best intance
+    if(found)
     {
         auto& op_ptr = op_ptrs[best_op_id];
         std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc

@@ -16,6 +16,7 @@
 using InLayout   = ck::tensor_layout::convolution::NDHWGC;
 using WeiLayout  = ck::tensor_layout::convolution::GKZYXC;
 using OutLayout  = ck::tensor_layout::convolution::NDHWGK;
+using BiasLayout = ck::tensor_layout::convolution::G_K;

 using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
 using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;

@@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
     std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
     std::array<ck::index_t, 6> out_strides{
         K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
+    // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
+    std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
+    std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};

     std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
     std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};

@@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
     SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
     SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
     SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
-    SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K);
+    SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);

     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
         NumDimSpatial,
         InLayout,
         WeiLayout,
-        ck::Tuple<OutLayout, OutLayout>,
+        ck::Tuple<OutLayout, BiasLayout>,
         OutLayout,
         InDataType,
         WeiDataType,

@@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
         in_strides,
         wei_lengths,
         wei_strides,
-        {out_lengths, out_lengths},
-        {out_strides, out_strides},
+        {out_lengths, bias_lengths},
+        {out_strides, bias_strides},
         out_lengths,
         out_strides,
         filter_strides,

@@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
         in_strides,
         wei_lengths,
         wei_strides,
-        {out_lengths, out_lengths},
-        {out_strides, out_strides},
+        {out_lengths, bias_lengths},
+        {out_strides, bias_strides},
         out_lengths,
         out_strides,
         filter_strides,
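The bias D-tensor above uses the same zero-stride broadcast convention as the groupnorm example: with lengths {G, 1, K, 1, 1, 1} and strides {K, 0, 1, 0, 0, 0} over GNKDHW, every spatial position of a given (g, k) reads the single element g * K + k, which is why d1 now only needs G * K elements rather than a full output-shaped buffer. A rough sketch of the size difference (illustrative dimensions, not taken from the commit):

    // Sketch: buffer size for a broadcast bias vs. a full output-shaped tensor.
    #include <cstddef>
    #include <iostream>

    int main()
    {
        // hypothetical problem dimensions for illustration only
        const std::size_t G = 2, N = 16, K = 64, Do = 8, Ho = 28, Wo = 28;
        const std::size_t elem = sizeof(float);

        const std::size_t full_size = elem * N * Do * Ho * Wo * G * K; // old d1 allocation
        const std::size_t bias_size = elem * G * K;                    // new d1 allocation

        std::cout << "full: " << full_size << " bytes, broadcast bias: " << bias_size
                  << " bytes\n";
        return 0;
    }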
client_example/25_tensor_transforms/CMakeLists.txt (new file, mode 100644)

add_executable(client_tensor_transform tensor_transform.cpp)
target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations)
add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
example/64_tensor_transforms/tensor_transform.cpp → client_example/25_tensor_transforms/tensor_transform.cpp

File moved.
example/64_tensor_transforms/tensor_transform_using_wrapper.cpp → client_example/25_tensor_transforms/tensor_transform_using_wrapper.cpp

@@ -9,7 +9,7 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/utility/sequence.hpp"
-#include "tensor_transform_wrapper.hpp"
+#include "ck/wrapper/layout.hpp"

 using DataType = int;

@@ -17,7 +17,7 @@ template <typename Layout>
 void Print1d(const Layout& layout)
 {
     std::cout << "Print1d" << std::endl;
-    for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size(layout); w++)
+    for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++)
     {
         std::cout << layout(ck::make_tuple(w)) << " ";
     }

@@ -28,9 +28,9 @@ template <typename Layout>
 void Print2d(const Layout& layout)
 {
     std::cout << "Print2d" << std::endl;
-    for(ck::index_t h = 0; h < ck::tensor_transform_wrapper::size<0>(layout); h++)
+    for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
     {
-        for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++)
+        for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
         {
             std::cout << layout(ck::make_tuple(h, w)) << " ";
         }

@@ -43,15 +43,11 @@ template <typename Layout>
 void Print3dCustom(const Layout& layout)
 {
     std::cout << "Print3dCustom" << std::endl;
-    for(ck::index_t d = 0;
-        d < ck::tensor_transform_wrapper::size<0>(ck::tensor_transform_wrapper::get<0>(layout));
-        d++)
+    for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
     {
-        for(ck::index_t h = 0;
-            h < ck::tensor_transform_wrapper::size<1>(ck::tensor_transform_wrapper::get<0>(layout));
-            h++)
+        for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
         {
-            for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++)
+            for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
            {
                 std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " ";
             }

@@ -68,7 +64,7 @@ int main()
     // Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor)
     // (dims:4,8 strides:1,4)
     const auto shape_4x8 = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
-    const auto layout_4x8_s1x4 = ck::tensor_transform_wrapper::make_layout(shape_4x8);
+    const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8);
     std::cout << "dims:4,8 strides:1,4" << std::endl;
     Print2d(layout_4x8_s1x4);

     using Cord1x1Type = ck::Tuple<ck::Number<1>, ck::Number<1>>;

@@ -77,10 +73,9 @@ int main()
     // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor)
     // dims:4,(2,4) strides:2,(1,8)
     const auto shape_4x2x4    = ck::make_tuple(4, ck::make_tuple(2, 4));
     const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
-    const auto layout_4x2x4_s2x1x8 =
-        ck::tensor_transform_wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
+    const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
     std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
     Print2d(layout_4x2x4_s2x1x8);

@@ -92,7 +87,7 @@ int main()
     const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}),
                                                  ck::make_tuple(ck::Number<2>{}, ck::Number<8>{}));
     static const auto layout_2x2x2x4_s1x4x2x8 =
-        ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
+        ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
     std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
     Print2d(layout_2x2x2x4_s1x4x2x8);

@@ -108,7 +103,7 @@ int main()
         ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}),
         ck::Number<8>{});
     static const auto layout_2x2x2x4_s1x4x2x8_nested =
-        ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
+        ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
     std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
     Print1d(layout_2x2x2x4_s1x4x2x8_nested);
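The change above is a mechanical rename: the wrapper namespace moves from ck::tensor_transform_wrapper to ck::wrapper and the header into ck/wrapper/layout.hpp in the library proper. A minimal sketch of the renamed API, distilled from the example itself (the headers and call signatures are assumed from this diff, not verified against the library):

    // Sketch: build a layout from a shape tuple and query it, per the diff above.
    #include <iostream>

    #include "ck/utility/tuple.hpp"
    #include "ck/wrapper/layout.hpp"

    int main()
    {
        // dims 4,8 with packed strides 1,4 - the same layout as layout_4x8_s1x4 above
        const auto shape  = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
        const auto layout = ck::wrapper::make_layout(shape);

        // per-dimension extents and coordinate-to-offset mapping
        std::cout << ck::wrapper::size<0>(layout) << " x " << ck::wrapper::size<1>(layout) << '\n';
        std::cout << layout(ck::make_tuple(2, 3)) << '\n'; // 2 * 1 + 3 * 4 = 14
        return 0;
    }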
cmake/ClangTidy.cmake

@@ -149,7 +149,7 @@ function(clang_tidy_check TARGET)
     add_custom_target(${tidy_target}
         # for some targets clang-tidy not able to get information from .clang-tidy
         DEPENDS ${SOURCE}
-        COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\;__HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml"
+        COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\;__HIP_PLATFORM_AMD__\;__HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml"
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        COMMENT "clang-tidy: Running clang-tidy on target ${SOURCE}..."
    )
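The commit simply adds __HIP_PLATFORM_AMD__ to the allowed reserved identifiers. Once CMake strips the escaping (\{, \[, \;), the flag that reaches clang-tidy should expand to roughly the following single argument (reconstructed by hand, not captured from a build log):

    -config={CheckOptions: [{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__;__HIP_PLATFORM_AMD__;__HIP_ROCclr__}]}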
dev-requirements.txt

-ROCmSoftwarePlatform/rocm-recipes
+ROCm/rocm-recipes
 RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build
-danmar/cppcheck@2.9
+danmar/cppcheck@2.9
\ No newline at end of file
docs/conf.py

@@ -4,23 +4,34 @@
 # list see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html

-import subprocess
+import re

 from rocm_docs import ROCmDocs

-name = "Composable Kernel"
+html_theme_options = {"flavor": "list"}

-get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
-version = subprocess.getoutput(get_version)
-if len(version) > 0:
-    name = f"{name} {version}"
+with open('../CMakeLists.txt', encoding='utf-8') as f:
+    match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read())
+if not match:
+    raise ValueError("VERSION not found!")
+
+version_number = match[1]
+left_nav_title = f"Composable Kernel {version_number} Documentation"
+
+# for PDF output on Read the Docs
+project = "Composable Kernel Documentation"
+author = "Advanced Micro Devices, Inc."
+copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
+version = version_number
+release = version_number

 external_toc_path = "./sphinx/_toc.yml"

-docs_core = ROCmDocs(f"{name} Documentation")
-docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
+docs_core = ROCmDocs(left_nav_title)
+docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
 docs_core.setup()

+external_projects_current_project = "composable_kernel"
+
 mathjax3_config = {
     'tex': {
         'macros': {
docs/doxygen/Doxyfile

@@ -58,7 +58,7 @@ PROJECT_LOGO =
 # entered, it will be relative to the location where doxygen was started. If
 # left blank the current directory will be used.

-OUTPUT_DIRECTORY = docBin
+OUTPUT_DIRECTORY = .

 # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and

@@ -778,7 +778,9 @@ WARN_LOGFILE =
 INPUT = ../../include/ck/tensor_operation/gpu/grid \
         ../../include/ck/tensor_operation/gpu/block \
         ../../include/ck/tensor_operation/gpu/thread \
-        ../../library/include/ck/library/utility
+        ../../library/include/ck/library/utility \
+        ../../include/ck/wrapper

 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
docs/index.rst

@@ -34,6 +34,7 @@ Current CK library are structured into 4 layers:
 * "Templated Tile Operators" layer
 * "Templated Kernel and Invoker" layer
 * "Instantiated Kernel and Invoker" layer
+* "Wrapper for tensor transform operations"
 * "Client API" layer

 .. image:: data/ck_layer.png

@@ -50,6 +51,7 @@ The following is a list of CK documents in the suggested reading order:
    tutorial_hello_world
    dockerhub
+   wrapper
    Supported_Primitives_Guide
    API_Reference_Guide
    Contributors_Guide
docs/sphinx/_toc.yml.in

@@ -5,6 +5,6 @@ defaults:
   maxdepth: 6
 root: index
 subtrees:
 - caption: About
   entries:
   - file: license