Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
cbcc844e
Commit
cbcc844e
authored
Feb 08, 2024
by
illsilin
Browse files
merge from public repo
parents
29deceb6
1f306024
Changes
393
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1088 additions
and
43 deletions
+1088
-43
client_example/05_layernorm/layernorm2d_bwd_data.cpp
client_example/05_layernorm/layernorm2d_bwd_data.cpp
+170
-0
client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
+171
-0
client_example/05_layernorm/layernorm2d_fwd.cpp
client_example/05_layernorm/layernorm2d_fwd.cpp
+2
-1
client_example/05_layernorm/layernorm4d_fwd.cpp
client_example/05_layernorm/layernorm4d_fwd.cpp
+2
-1
client_example/06_softmax/softmax4d.cpp
client_example/06_softmax/softmax4d.cpp
+1
-0
client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
.../12_elementwise_normalization/elementwise_layernorm2d.cpp
+1
-0
client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
+1
-0
client_example/18_groupnorm/CMakeLists.txt
client_example/18_groupnorm/CMakeLists.txt
+8
-2
client_example/18_groupnorm/groupnorm_bwd_data.cpp
client_example/18_groupnorm/groupnorm_bwd_data.cpp
+182
-0
client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
+180
-0
client_example/18_groupnorm/groupnorm_swish_fwd.cpp
client_example/18_groupnorm/groupnorm_swish_fwd.cpp
+0
-0
client_example/19_pool/avg_pool3d_fwd.cpp
client_example/19_pool/avg_pool3d_fwd.cpp
+31
-32
client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp
client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp
+1
-0
client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
...ple/23_elementwise_transpose/elementwise_transpose_3d.cpp
+1
-0
client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
...scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
+10
-6
client_example/25_wrapper/CMakeLists.txt
client_example/25_wrapper/CMakeLists.txt
+4
-0
client_example/25_wrapper/tensor_transform_using_wrapper.cpp
client_example/25_wrapper/tensor_transform_using_wrapper.cpp
+114
-0
client_example/25_wrapper/wrapper_img2col.cpp
client_example/25_wrapper/wrapper_img2col.cpp
+180
-0
cmake/ClangTidy.cmake
cmake/ClangTidy.cmake
+1
-1
cmake/getopt.cmake
cmake/getopt.cmake
+28
-0
No files found.
client_example/05_layernorm/layernorm2d_bwd_data.cpp
0 → 100644
View file @
cbcc844e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_data.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_data.hpp"
// Tensor rank of the problem and how many of its dimensions are reduced.
constexpr int Rank{2};
constexpr int NumReduceDim{1};

// Element types plugged into the DeviceNormalizationBwdData template below.
using DYDataType         = float;
using XDataType          = float;
using GammaDataType      = float;
using MeanInvStdDataType = float;
using DXDataType         = float;
/// Owns a single device allocation obtained with hipMalloc and releases it
/// with hipFree when the object goes out of scope.
///
/// The type is non-copyable: the destructor frees p_mem_, so an implicit copy
/// would make two objects free the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Requests mem_size bytes of device memory.
    /// The hipMalloc status is deliberately discarded (as before); p_mem_ is
    /// initialised to nullptr first so it never holds an indeterminate value.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    // Copying would duplicate ownership of the raw pointer (double hipFree).
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer held by this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
/// Client example for 2D layernorm backward (data gradient).
///
/// Allocates device buffers for an M x N problem, times every registered
/// DeviceNormalizationBwdData instance that supports the problem, reports the
/// fastest one and finally runs it once more without timing.
///
/// @return always 0.
int main(int argc, char* argv[])
{
    // The example takes no command-line options.
    (void)argc;
    (void)argv;

    ck::index_t M = 1024;
    ck::index_t N = 1024;

    SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
    SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
    SimpleDeviceMem gamma_dev(sizeof(GammaDataType) * N);
    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
    SimpleDeviceMem dx_dev(sizeof(DXDataType) * M * N);

    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
                                                                              XDataType,
                                                                              GammaDataType,
                                                                              MeanInvStdDataType,
                                                                              DXDataType,
                                                                              Rank,
                                                                              NumReduceDim>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Builds the argument object for one instance. Shared by the profiling
    // loop and the final run so both describe exactly the same problem.
    auto make_argument = [&](const auto& op) {
        return op->MakeArgumentPointer({M, N}, // lengths
                                       {N, 1}, // dyStrides
                                       {N, 1}, // xStrides
                                       {0, 1}, // gammaStrides
                                       {1, 0}, // meanStrides
                                       {1, 0}, // invStdStrides
                                       {N, 1}, // dxStrides
                                       {1},    // reduceDims
                                       dy_dev.GetDeviceBuffer(),
                                       x_dev.GetDeviceBuffer(),
                                       gamma_dev.GetDeviceBuffer(),
                                       mean_dev.GetDeviceBuffer(),
                                       inv_std_dev.GetDeviceBuffer(),
                                       dx_dev.GetDeviceBuffer());
    };

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // Bytes moved per run; identical for every instance, so computed once.
    const std::size_t num_byte = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
                                 sizeof(GammaDataType) * N + sizeof(MeanInvStdDataType) * M * 2 +
                                 sizeof(DXDataType) * M * N;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    // Signed count avoids comparing the int index with an unsigned size().
    const int num_ops = static_cast<int>(op_ptrs.size());

    for(int i = 0; i < num_ops; ++i)
    {
        auto& op_ptr      = op_ptrs[i];
        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            // bytes / 1e6 / milliseconds == GB/s
            float gb_per_sec = num_byte / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec
                      << " GB/s, " << op_name << std::endl;

            if(ave_time < best_ave_time)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    // run the best instance
    if(found)
    {
        // Reported only when an instance was found; otherwise best_ave_time
        // still holds the numeric_limits sentinel and the line is meaningless.
        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_op_name << std::endl;

        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
client_example/05_layernorm/layernorm2d_bwd_gamma_beta.cpp
0 → 100644
View file @
cbcc844e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
// Tensor rank of the problem and how many of its dimensions are reduced.
constexpr int Rank{2};
constexpr int NumReduceDim{1};

// Element types used by this example.
using DYDataType         = float;
using XDataType          = float;
using GammaDataType      = float;
using MeanInvStdDataType = float;
using DGammaDataType     = float;
using DBetaDataType      = float;
/// Owns a single device allocation obtained with hipMalloc and releases it
/// with hipFree when the object goes out of scope.
///
/// The type is non-copyable: the destructor frees p_mem_, so an implicit copy
/// would make two objects free the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Requests mem_size bytes of device memory.
    /// The hipMalloc status is deliberately discarded (as before); p_mem_ is
    /// initialised to nullptr first so it never holds an indeterminate value.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    // Copying would duplicate ownership of the raw pointer (double hipFree).
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer held by this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
/// Client example for 2D layernorm backward (gamma/beta gradients).
///
/// Allocates device buffers for an M x N problem, times every registered
/// DeviceNormalizationBwdGammaBeta instance that supports the problem, reports
/// the fastest one and finally runs it once more without timing.
///
/// @return always 0.
int main(int argc, char* argv[])
{
    // The example takes no command-line options.
    (void)argc;
    (void)argv;

    ck::index_t M = 1024;
    ck::index_t N = 1024;

    SimpleDeviceMem dy_dev(sizeof(DYDataType) * M * N);
    SimpleDeviceMem x_dev(sizeof(XDataType) * M * N);
    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * M);
    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * M);
    SimpleDeviceMem dgamma_dev(sizeof(DGammaDataType) * N);
    SimpleDeviceMem dbeta_dev(sizeof(DBetaDataType) * N);

    using DeviceOp =
        ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
                                                                      XDataType,
                                                                      MeanInvStdDataType,
                                                                      DGammaDataType,
                                                                      DBetaDataType,
                                                                      Rank,
                                                                      NumReduceDim>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Builds the argument object for one instance. Shared by the profiling
    // loop and the final run so both describe exactly the same problem.
    auto make_argument = [&](const auto& op) {
        return op->MakeArgumentPointer({M, N}, // inLengths
                                       {N, 1}, // dyStrides
                                       {N, 1}, // xStrides
                                       {1, 0}, // meanStrides
                                       {1, 0}, // invStdStrides
                                       {N},    // outLengths
                                       {1},    // dgammaStrides
                                       {1},    // dbetaStrides
                                       {0},    // reduceDims
                                       dy_dev.GetDeviceBuffer(),
                                       x_dev.GetDeviceBuffer(),
                                       mean_dev.GetDeviceBuffer(),
                                       inv_std_dev.GetDeviceBuffer(),
                                       dgamma_dev.GetDeviceBuffer(),
                                       dbeta_dev.GetDeviceBuffer());
    };

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    // Bytes moved per run; identical for every instance.
    std::size_t num_bytes = sizeof(DYDataType) * M * N + sizeof(XDataType) * M * N +
                            sizeof(MeanInvStdDataType) * M * 2 + sizeof(DGammaDataType) * N +
                            sizeof(DBetaDataType) * N;

    // Signed count avoids comparing the int index with an unsigned size().
    const int num_ops = static_cast<int>(op_ptrs.size());

    for(int i = 0; i < num_ops; ++i)
    {
        auto& op_ptr      = op_ptrs[i];
        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            // bytes / 1e6 / milliseconds == GB/s
            float gb_per_sec = num_bytes / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec
                      << " GB/s, " << op_name << std::endl;

            if(ave_time < best_ave_time)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    // run the best instance
    if(found)
    {
        // Reported only when an instance was found; otherwise best_ave_time
        // still holds the numeric_limits sentinel and the line is meaningless.
        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_op_name << std::endl;

        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
client_example/05_layernorm/layernorm2d_fwd.cpp
View file @
cbcc844e
...
...
@@ -16,7 +16,7 @@ using XDataType = ck::half_t;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
SaveMeanInvStdDataType
=
floa
t
;
using
SaveMeanInvStdDataType
=
ck
::
half_
t
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
#define SAVE_MEAN_INV_STD
...
...
@@ -150,6 +150,7 @@ int main(int argc, char* argv[])
<<
best_op_name
<<
std
::
endl
;
// run the best intance
if
(
found
)
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
...
...
client_example/05_layernorm/layernorm4d_fwd.cpp
View file @
cbcc844e
...
...
@@ -16,7 +16,7 @@ using XDataType = ck::half_t;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
SaveMeanInvStdDataType
=
floa
t
;
using
SaveMeanInvStdDataType
=
ck
::
half_
t
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
#define SAVE_MEAN_INV_STD
...
...
@@ -155,6 +155,7 @@ int main(int argc, char* argv[])
<<
best_op_name
<<
std
::
endl
;
// run the best intance
if
(
found
)
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
...
...
client_example/06_softmax/softmax4d.cpp
View file @
cbcc844e
...
...
@@ -140,6 +140,7 @@ int main(int argc, char* argv[])
<<
best_op_name
<<
std
::
endl
;
// run the best intance
if
(
found
)
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
...
...
client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp
View file @
cbcc844e
...
...
@@ -142,6 +142,7 @@ int main()
<<
best_op_name
<<
std
::
endl
;
// run the best intance
if
(
found
)
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
...
...
client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
View file @
cbcc844e
...
...
@@ -204,6 +204,7 @@ int main(int argc, char* argv[])
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
// run the best intance
if
(
found
)
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
...
...
client_example/18_groupnorm/CMakeLists.txt
View file @
cbcc844e
add_executable
(
client_groupnorm_swish groupnorm_swish.cpp
)
target_link_libraries
(
client_groupnorm_swish PRIVATE composable_kernel::device_other_operations
)
add_executable
(
client_groupnorm_bwd_data groupnorm_bwd_data.cpp
)
target_link_libraries
(
client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations
)
add_executable
(
client_groupnorm_bwd_gamma_beta groupnorm_bwd_gamma_beta.cpp
)
target_link_libraries
(
client_groupnorm_bwd_gamma_beta PRIVATE composable_kernel::device_other_operations
)
add_executable
(
client_groupnorm_swish_fwd groupnorm_swish_fwd.cpp
)
target_link_libraries
(
client_groupnorm_swish_fwd PRIVATE composable_kernel::device_other_operations
)
client_example/18_groupnorm/groupnorm_bwd_data.cpp
0 → 100644
View file @
cbcc844e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_data.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_data.hpp"
// Tensor rank of the problem and how many of its dimensions are reduced.
constexpr int Rank{5};
constexpr int NumReduceDim{3};

// Element types plugged into the DeviceNormalizationBwdData template below.
using DYDataType         = float;
using XDataType          = float;
using GammaDataType      = float;
using MeanInvStdDataType = float;
using DXDataType         = float;
/// Owns a single device allocation obtained with hipMalloc and releases it
/// with hipFree when the object goes out of scope.
///
/// The type is non-copyable: the destructor frees p_mem_, so an implicit copy
/// would make two objects free the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Requests mem_size bytes of device memory.
    /// The hipMalloc status is deliberately discarded (as before); p_mem_ is
    /// initialised to nullptr first so it never holds an indeterminate value.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    // Copying would duplicate ownership of the raw pointer (double hipFree).
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer held by this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
/// Client example for groupnorm backward (data gradient).
///
/// Allocates device buffers for an N x H x W x G x C problem, times every
/// registered DeviceNormalizationBwdData instance that supports the problem,
/// reports the fastest one and finally runs it once more without timing.
///
/// @return always 0.
int main(int argc, char* argv[])
{
    // The example takes no command-line options.
    (void)argc;
    (void)argv;

    ck::index_t N = 32;
    ck::index_t H = 16;
    ck::index_t W = 16;
    ck::index_t G = 64;
    ck::index_t C = 128;

    // Widen before multiplying so the element count is computed in size_t
    // rather than in ck::index_t, which could overflow for larger shapes.
    std::size_t length = static_cast<std::size_t>(N) * H * W * G * C;

    std::vector<ck::index_t> strideDy         = {H * W * G * C, W * G * C, G * C, C, 1};
    std::vector<ck::index_t> strideX          = strideDy;
    std::vector<ck::index_t> strideDx         = strideDy;
    std::vector<ck::index_t> strideGamma      = {0, 0, 0, C, 1};
    std::vector<ck::index_t> strideMeanInvStd = {G, 0, 0, 1, 0};

    SimpleDeviceMem dy_dev(sizeof(DYDataType) * length);
    SimpleDeviceMem x_dev(sizeof(XDataType) * length);
    SimpleDeviceMem gamma_dev(sizeof(GammaDataType) * G * C);
    SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G);
    SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G);
    SimpleDeviceMem dx_dev(sizeof(DXDataType) * length);

    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
                                                                              XDataType,
                                                                              GammaDataType,
                                                                              MeanInvStdDataType,
                                                                              DXDataType,
                                                                              Rank,
                                                                              NumReduceDim>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Builds the argument object for one instance. Shared by the profiling
    // loop and the final run so both describe exactly the same problem.
    auto make_argument = [&](const auto& op) {
        return op->MakeArgumentPointer({N, H, W, G, C},
                                       strideDy,
                                       strideX,
                                       strideGamma,
                                       strideMeanInvStd,
                                       strideMeanInvStd,
                                       strideDx,
                                       {1, 2, 4}, // reduceDims
                                       dy_dev.GetDeviceBuffer(),
                                       x_dev.GetDeviceBuffer(),
                                       gamma_dev.GetDeviceBuffer(),
                                       mean_dev.GetDeviceBuffer(),
                                       inv_std_dev.GetDeviceBuffer(),
                                       dx_dev.GetDeviceBuffer());
    };

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // Bytes moved per run; identical for every instance, so computed once.
    const std::size_t num_byte = sizeof(DYDataType) * length + sizeof(XDataType) * length +
                                 sizeof(GammaDataType) * G * C +
                                 sizeof(MeanInvStdDataType) * N * G * 2 +
                                 sizeof(DXDataType) * length;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    // Signed count avoids comparing the int index with an unsigned size().
    const int num_ops = static_cast<int>(op_ptrs.size());

    for(int i = 0; i < num_ops; ++i)
    {
        auto& op_ptr      = op_ptrs[i];
        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            // bytes / 1e6 / milliseconds == GB/s
            float gb_per_sec = num_byte / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec
                      << " GB/s, " << op_name << std::endl;

            if(ave_time < best_ave_time)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    // run the best instance
    if(found)
    {
        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_op_name << std::endl;

        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
client_example/18_groupnorm/groupnorm_bwd_gamma_beta.cpp
0 → 100644
View file @
cbcc844e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_gamma_beta.hpp"
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp"
// Tensor rank of the problem and how many of its dimensions are reduced.
constexpr int Rank{5};
constexpr int NumReduceDim{3};

// Element types used by this example.
using DYDataType         = float;
using XDataType          = float;
using GammaDataType      = float;
using MeanInvStdDataType = float;
using DGammaDataType     = float;
using DBetaDataType      = float;
/// Owns a single device allocation obtained with hipMalloc and releases it
/// with hipFree when the object goes out of scope.
///
/// The type is non-copyable: the destructor frees p_mem_, so an implicit copy
/// would make two objects free the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Requests mem_size bytes of device memory.
    /// The hipMalloc status is deliberately discarded (as before); p_mem_ is
    /// initialised to nullptr first so it never holds an indeterminate value.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    // Copying would duplicate ownership of the raw pointer (double hipFree).
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer held by this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
int
main
(
int
argc
,
char
*
argv
[])
{
ck
::
index_t
N
=
32
;
ck
::
index_t
H
=
16
;
ck
::
index_t
W
=
16
;
ck
::
index_t
G
=
64
;
ck
::
index_t
C
=
128
;
std
::
size_t
length
=
N
*
H
*
W
*
G
*
C
;
std
::
vector
<
ck
::
index_t
>
strideDy
=
{
H
*
W
*
G
*
C
,
W
*
G
*
C
,
G
*
C
,
C
,
1
};
std
::
vector
<
ck
::
index_t
>
strideX
=
strideDy
;
std
::
vector
<
ck
::
index_t
>
strideMeanInvStd
=
{
G
,
0
,
0
,
1
,
0
};
std
::
vector
<
ck
::
index_t
>
strideDGammaBeta
=
{
C
,
1
};
SimpleDeviceMem
dy_dev
(
sizeof
(
DYDataType
)
*
length
);
SimpleDeviceMem
x_dev
(
sizeof
(
XDataType
)
*
length
);
SimpleDeviceMem
mean_dev
(
sizeof
(
MeanInvStdDataType
)
*
N
*
G
);
SimpleDeviceMem
inv_std_dev
(
sizeof
(
MeanInvStdDataType
)
*
N
*
G
);
SimpleDeviceMem
dgamma_dev
(
sizeof
(
DGammaDataType
)
*
G
*
C
);
SimpleDeviceMem
dbeta_dev
(
sizeof
(
DBetaDataType
)
*
G
*
C
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceNormalizationBwdGammaBeta
<
DYDataType
,
XDataType
,
MeanInvStdDataType
,
DGammaDataType
,
DBetaDataType
,
Rank
,
NumReduceDim
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
std
::
size_t
num_bytes
=
sizeof
(
DYDataType
)
*
length
+
sizeof
(
XDataType
)
*
length
+
sizeof
(
GammaDataType
)
*
G
*
C
+
sizeof
(
MeanInvStdDataType
)
*
N
*
G
*
2
+
sizeof
(
DGammaDataType
)
*
G
*
C
+
sizeof
(
DBetaDataType
)
*
G
*
C
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
({
N
,
H
,
W
,
G
,
C
},
strideDy
,
strideX
,
strideMeanInvStd
,
strideMeanInvStd
,
{
G
,
C
},
strideDGammaBeta
,
strideDGammaBeta
,
{
0
,
1
,
2
},
// reduceDims
dy_dev
.
GetDeviceBuffer
(),
x_dev
.
GetDeviceBuffer
(),
mean_dev
.
GetDeviceBuffer
(),
inv_std_dev
.
GetDeviceBuffer
(),
dgamma_dev
.
GetDeviceBuffer
(),
dbeta_dev
.
GetDeviceBuffer
());
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
float
gb_per_sec
=
num_bytes
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
ave_time
<
best_ave_time
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
// run the best intance
if
(
found
)
{
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
({
N
,
H
,
W
,
G
,
C
},
strideDy
,
strideX
,
strideMeanInvStd
,
strideMeanInvStd
,
{
G
,
C
},
strideDGammaBeta
,
strideDGammaBeta
,
{
0
,
1
,
2
},
// reduceDims
dy_dev
.
GetDeviceBuffer
(),
x_dev
.
GetDeviceBuffer
(),
mean_dev
.
GetDeviceBuffer
(),
inv_std_dev
.
GetDeviceBuffer
(),
dgamma_dev
.
GetDeviceBuffer
(),
dbeta_dev
.
GetDeviceBuffer
());
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
}
client_example/18_groupnorm/groupnorm_swish.cpp
→
client_example/18_groupnorm/groupnorm_swish
_fwd
.cpp
View file @
cbcc844e
File moved
client_example/19_pool/avg_pool3d_fwd.cpp
View file @
cbcc844e
...
...
@@ -94,7 +94,6 @@ int main(int argc, char* argv[])
SimpleDeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_tensor_size
);
SimpleDeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_tensor_size
);
SimpleDeviceMem
out_indices_device_buf
(
sizeof
(
IndexDataType
)
*
out_tensor_size
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DevicePoolFwd
<
InOutRank
,
WindowRank
,
...
...
@@ -123,22 +122,22 @@ int main(int argc, char* argv[])
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
out_indices_device_buf
.
GetDeviceBuffer
())
,
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
{
2
,
3
,
4
});
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
nullptr
,
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
{
2
,
3
,
4
});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
...
@@ -184,21 +183,21 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
out_indices_device_buf
.
GetDeviceBuffer
())
,
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
{
2
,
3
,
4
});
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
nullptr
,
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
{
2
,
3
,
4
});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
...
client_example/20_splitk_gemm/splitK_gemm_fp16_f8.cpp
View file @
cbcc844e
...
...
@@ -191,6 +191,7 @@ int main(int argc, char* argv[])
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
// run the best intance
if
(
found
)
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
...
...
client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
View file @
cbcc844e
...
...
@@ -117,6 +117,7 @@ int main()
<<
best_op_name
<<
std
::
endl
;
// run the best intance
if
(
found
)
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
...
...
client_example/23_grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc
View file @
cbcc844e
...
...
@@ -16,6 +16,7 @@
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGC
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
GKZYXC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGK
;
using
BiasLayout
=
ck
::
tensor_layout
::
convolution
::
G_K
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ScaleAddScaleAddRelu
=
ck
::
tensor_operation
::
element_wise
::
ScaleAddScaleAddRelu
;
...
...
@@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
std
::
array
<
ck
::
index_t
,
6
>
out_lengths
{
G
,
N
,
K
,
Do
,
Ho
,
Wo
};
std
::
array
<
ck
::
index_t
,
6
>
out_strides
{
K
,
Do
*
Ho
*
Wo
*
G
*
K
,
1
,
Ho
*
Wo
*
G
*
K
,
Wo
*
G
*
K
,
G
*
K
};
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
std
::
array
<
ck
::
index_t
,
6
>
bias_lengths
{
G
,
1
,
K
,
1
,
1
,
1
};
std
::
array
<
ck
::
index_t
,
6
>
bias_strides
{
K
,
0
,
1
,
0
,
0
,
0
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_strides
{
1
,
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
filter_dilations
{
1
,
1
,
1
};
...
...
@@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
SimpleDeviceMem
wei
(
sizeof
(
WeiDataType
)
*
G
*
K
*
Z
*
Y
*
X
*
C
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Do
*
Ho
*
Wo
*
G
*
K
);
SimpleDeviceMem
d0
(
sizeof
(
std
::
tuple_element_t
<
0
,
DDataTypes
>
)
*
N
*
Do
*
Ho
*
Wo
*
G
*
K
);
SimpleDeviceMem
d1
(
sizeof
(
std
::
tuple_element_t
<
1
,
DDataTypes
>
)
*
N
*
Do
*
Ho
*
Wo
*
G
*
K
);
SimpleDeviceMem
d1
(
sizeof
(
std
::
tuple_element_t
<
1
,
DDataTypes
>
)
*
G
*
K
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleABD
<
NumDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<
OutLayout
,
Out
Layout
>
,
ck
::
Tuple
<
OutLayout
,
Bias
Layout
>
,
OutLayout
,
InDataType
,
WeiDataType
,
...
...
@@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
in_strides
,
wei_lengths
,
wei_strides
,
{
out_lengths
,
out
_lengths
},
{
out_strides
,
out
_strides
},
{
out_lengths
,
bias
_lengths
},
{
out_strides
,
bias
_strides
},
out_lengths
,
out_strides
,
filter_strides
,
...
...
@@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
in_strides
,
wei_lengths
,
wei_strides
,
{
out_lengths
,
out
_lengths
},
{
out_strides
,
out
_strides
},
{
out_lengths
,
bias
_lengths
},
{
out_strides
,
bias
_strides
},
out_lengths
,
out_strides
,
filter_strides
,
...
...
client_example/25_wrapper/CMakeLists.txt
0 → 100644
View file @
cbcc844e
# Example that builds several ck::wrapper layouts and prints them.
add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)

# Example that runs an image-to-column copy kernel written with ck::wrapper.
add_executable(client_wrapper_img2col wrapper_img2col.cpp)
target_link_libraries(client_wrapper_img2col PRIVATE composable_kernel::device_other_operations)
client_example/25_wrapper/tensor_transform_using_wrapper.cpp
0 → 100644
View file @
cbcc844e
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/wrapper/layout.hpp"
// Element type alias for this example.
using DataType = int;
template
<
typename
Layout
>
void
Print1d
(
const
Layout
&
layout
)
{
std
::
cout
<<
"Print1d"
<<
std
::
endl
;
for
(
ck
::
index_t
w
=
0
;
w
<
ck
::
wrapper
::
size
(
layout
);
w
++
)
{
std
::
cout
<<
layout
(
ck
::
make_tuple
(
w
))
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
}
template
<
typename
Layout
>
void
Print2d
(
const
Layout
&
layout
)
{
std
::
cout
<<
"Print2d"
<<
std
::
endl
;
for
(
ck
::
index_t
h
=
0
;
h
<
ck
::
wrapper
::
size
<
0
>
(
layout
);
h
++
)
{
for
(
ck
::
index_t
w
=
0
;
w
<
ck
::
wrapper
::
size
<
1
>
(
layout
);
w
++
)
{
std
::
cout
<<
layout
(
ck
::
make_tuple
(
h
,
w
))
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
}
}
/// Prints `layout` using coordinates of the form ((d, h), w).
///
/// Dimension 0 of `layout` is itself treated as a two-dimensional layout:
/// `d` runs over its dimension 0 and `h` over its dimension 1, while `w`
/// runs over dimension 1 of the whole layout. One text line is written per
/// (d, h) pair and a blank line follows each `d` slice.
///
/// @param layout  layout object to query; it is only read.
template <typename Layout>
void Print3dCustom(const Layout& layout)
{
    std::cout << "Print3dCustom" << std::endl;

    // Sub-layout describing the nested first dimension.
    const auto first_dim = ck::wrapper::get<0>(layout);
    const auto depth     = ck::wrapper::size<0>(first_dim);
    const auto height    = ck::wrapper::size<1>(first_dim);
    const auto width     = ck::wrapper::size<1>(layout);

    for(ck::index_t d = 0; d < depth; d++)
    {
        for(ck::index_t h = 0; h < height; h++)
        {
            for(ck::index_t w = 0; w < width; w++)
            {
                const auto coord = ck::make_tuple(ck::make_tuple(d, h), w);
                std::cout << layout(coord) << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
/// Builds four layouts that all address the offsets 0..31 and prints each of
/// them with the Print* helpers.
int main()
{
    // Layouts in this example are traversed in column-major order
    std::cout << "Note: Layout traverse in column-major" << std::endl;

    // Compile-time descriptor covering offsets 0, 1, 2, ... 30, 31.
    // dims:4,8 strides:1,4 (only the shape is passed to make_layout)
    const auto dims_4x8        = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
    const auto layout_4x8_s1x4 = ck::wrapper::make_layout(dims_4x8);
    std::cout << "dims:4,8 strides:1,4" << std::endl;
    Print2d(layout_4x8_s1x4);

    // Offset of coordinate [1, 1], evaluated at compile time.
    using Coord1x1 = ck::Tuple<ck::Number<1>, ck::Number<1>>;
    constexpr ck::index_t offset_1x1 = layout_4x8_s1x4.template operator()<Coord1x1>();
    std::cout << "Constexpr calculated [1, 1] offset:" << offset_1x1 << std::endl;

    // Runtime descriptor covering offsets 0..31.
    // dims:4,(2,4) strides:2,(1,8)
    const auto dims_4x2x4          = ck::make_tuple(4, ck::make_tuple(2, 4));
    const auto steps_2x1x8         = ck::make_tuple(2, ck::make_tuple(1, 8));
    const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(dims_4x2x4, steps_2x1x8);
    std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
    Print2d(layout_4x2x4_s2x1x8);

    // Compile-time descriptor covering offsets 0..31.
    // dims:(2,2),(2,4) strides:(1,4),(2,8)
    const auto dims_2x2x2x4 =
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}),
                       ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}));
    const auto steps_1x4x2x8 =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}),
                       ck::make_tuple(ck::Number<2>{}, ck::Number<8>{}));
    static const auto layout_2x2x2x4_s1x4x2x8 =
        ck::wrapper::make_layout(dims_2x2x2x4, steps_1x4x2x8);
    std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
    Print2d(layout_2x2x2x4_s1x4x2x8);
    Print3dCustom(layout_2x2x2x4_s1x4x2x8);

    // Compile-time descriptor covering offsets 0..31, with a deeper nesting.
    // dims:((2,2),2),4 strides:((1,4),2),8
    // Transform to 2d
    const auto dims_nested = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<2>{}),
        ck::Number<4>{});
    const auto steps_nested = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}),
        ck::Number<8>{});
    static const auto layout_2x2x2x4_s1x4x2x8_nested =
        ck::wrapper::make_layout(dims_nested, steps_nested);
    std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
    Print1d(layout_2x2x2x4_s1x4x2x8_nested);
    Print2d(layout_2x2x2x4_s1x4x2x8_nested);
    Print3dCustom(layout_2x2x2x4_s1x4x2x8_nested);

    return 0;
}
client_example/25_wrapper/wrapper_img2col.cpp
0 → 100644
View file @
cbcc844e
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <initializer_list>
#include <vector>
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
// Number of spatial dimensions; sizes the filter stride/dilation arrays below.
static constexpr ck::index_t NumDimSpatial = 3;

// Element type of the input and output device buffers.
using DataType = float;

// Tensor layout tag for the convolution input.
using InputLayout = ck::tensor_layout::convolution::NDHWGC;
/// Minimal owner of a device allocation: allocates with hipMalloc on
/// construction and releases with hipFree on destruction.
///
/// The object is the sole owner of `p_mem_`, so copying is disabled; a copy
/// would make two objects free the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Requests `mem_size` bytes from hipMalloc.
    /// The hipMalloc status is deliberately ignored (best effort); `p_mem_`
    /// is null-initialized before the call.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // Owning a raw pointer: forbid copies to avoid freeing it twice.
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw pointer held by this object.
    void* GetDeviceBuffer() { return p_mem_; }

    /// Passes the held pointer to hipFree; the status is ignored.
    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
/// Kernel that copies one tile of `input_tensor` into the matching tile of
/// `output_tensor`.
///
/// The tile is selected by blockIdx.x; inside the tile every thread picks its
/// own partition from `thread_layout` using threadIdx.x and copies it with
/// ck::wrapper::copy.
///
/// @param input_tensor   source tensor
/// @param output_tensor  destination tensor
/// @param tile_shape     shape of the tile handled by one block
/// @param thread_layout  thread arrangement used to split a tile
template <typename InputTensor,
          typename OutputTensor,
          typename BlockShape,
          typename ThreadLayoutShape>
__global__ void DeviceImageToColumnPad0(InputTensor input_tensor,
                                        OutputTensor output_tensor,
                                        const BlockShape tile_shape,
                                        const ThreadLayoutShape thread_layout)
{
    // Settings handed to ck::wrapper::copy.
    using DimAccessOrder                    = ck::Tuple<ck::Number<0>, ck::Number<1>>;
    constexpr ck::index_t vector_dim        = 1;
    constexpr ck::index_t scalar_per_vector = 4;

    const ck::index_t tile_id = static_cast<ck::index_t>(blockIdx.x);

    // Tile of each tensor assigned to this block
    auto src_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, tile_id);
    auto dst_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, tile_id);

    // Part of each tile assigned to this thread
    const auto src_partition =
        ck::wrapper::make_local_partition(src_tile, thread_layout, threadIdx.x);
    auto dst_partition = ck::wrapper::make_local_partition(dst_tile, thread_layout, threadIdx.x);

    // Perform copy
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(src_partition,
                                                                     dst_partition);
}
/// Describes the input and output of an image-to-column transform (no
/// padding) as two ck::wrapper layouts over the same shape, launches
/// DeviceImageToColumnPad0 on them and prints the measured time and bandwidth.
///
/// @param G                 number of groups
/// @param N                 batch size
/// @param Di, Hi, Wi        input depth / height / width
/// @param Do, Ho, Wo        output depth / height / width
/// @param C                 input channels per group
/// @param Z, Y, X           filter depth / height / width
/// @param filter_strides    filter strides, indexed [0]=D, [1]=H, [2]=W
/// @param filter_dilations  filter dilations, indexed [0]=D, [1]=H, [2]=W
void PerformImageToColumnPad0(const ck::index_t G,
                              const ck::index_t N,
                              const ck::index_t Di,
                              const ck::index_t Hi,
                              const ck::index_t Wi,
                              const ck::index_t Do,
                              const ck::index_t Ho,
                              const ck::index_t Wo,
                              const ck::index_t C,
                              const ck::index_t Z,
                              const ck::index_t Y,
                              const ck::index_t X,
                              std::array<ck::index_t, NumDimSpatial> filter_strides,
                              std::array<ck::index_t, NumDimSpatial> filter_dilations)
{
    const ck::index_t ZYXC = Z * Y * X * C;
    const ck::index_t GC   = G * C;

    // shape: ((G, (Wo, Ho, Do, N)), (C, X, Y, Z))
    const auto shape = ck::make_tuple(ck::make_tuple(G, ck::make_tuple(Wo, Ho, Do, N)),
                                      ck::make_tuple(C, X, Y, Z));

    // Input strides: output positions step by the filter stride, filter taps
    // step by the filter dilation.
    const auto in_strides =
        ck::make_tuple(ck::make_tuple(C,
                                      ck::make_tuple(filter_strides[2] * GC,
                                                     filter_strides[1] * Wi * GC,
                                                     filter_strides[0] * Hi * Wi * GC,
                                                     Di * Hi * Wi * GC)),
                       ck::make_tuple(1,
                                      filter_dilations[2] * GC,
                                      filter_dilations[1] * Wi * GC,
                                      filter_dilations[0] * Hi * Wi * GC));
    const auto in_layout = ck::wrapper::make_layout(shape, in_strides);

    const auto out_strides = ck::make_tuple(
        ck::make_tuple(ZYXC,
                       ck::make_tuple(
                           ZYXC * G, Wo * ZYXC * G, Ho * Wo * ZYXC * G, Do * Ho * Wo * ZYXC * G)),
        ck::make_tuple(1, C, X * C, Y * X * C));
    const auto out_layout = ck::wrapper::make_layout(shape, out_strides);

    // Byte counts are accumulated in std::size_t from the first factor on, so
    // the product cannot overflow the 32-bit index type for large shapes.
    const std::size_t input_bytes =
        sizeof(DataType) * static_cast<std::size_t>(N) * Di * Hi * Wi * GC;

    // Global memory buffers
    SimpleDeviceMem in_buf(input_bytes);
    SimpleDeviceMem out_buf(ck::wrapper::size(out_layout) * sizeof(DataType));

    // User can choose appropriate number of threads and sizes per block
    const auto thread_layout = ck::make_tuple(ck::Number<8>{}, ck::Number<16>{});
    // This example doesn't support padding, user should select tile sizes
    // which divides the shape completely
    const auto tile_shape = ck::make_tuple(ck::Number<32>{}, ck::Number<64>{});

    // Create buffers for global memory
    auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(in_buf.GetDeviceBuffer()), in_layout);
    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<DataType*>(out_buf.GetDeviceBuffer()), out_layout);

    // One block per tile: tiles along dimension 0 times tiles along dimension 1.
    const ck::index_t grid_size = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout),
                                                                ck::wrapper::size<0>(tile_shape)) *
                                  ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout),
                                                                ck::wrapper::size<1>(tile_shape));

    const auto kernel = DeviceImageToColumnPad0<decltype(input_tensor_global),
                                                decltype(output_tensor_global),
                                                decltype(tile_shape),
                                                decltype(thread_layout)>;
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size),
                                                  dim3(ck::wrapper::size(thread_layout)),
                                                  0,
                                                  input_tensor_global,
                                                  output_tensor_global,
                                                  tile_shape,
                                                  thread_layout);

    // Every output element is read once and written once, hence the factor 2.
    const std::size_t num_btype =
        sizeof(DataType) * 2 * static_cast<std::size_t>(G) * N * Do * Ho * Wo * ZYXC;
    float gb_per_sec = num_btype / 1.E6 / avg_time;
    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
              << std::endl;
}
int
main
(
int
argc
,
char
*
argv
[])
{
constexpr
ck
::
index_t
G
=
4
;
// number of groups
constexpr
ck
::
index_t
N
=
32
;
// batch
constexpr
ck
::
index_t
C
=
64
;
// input channel (per group)
constexpr
ck
::
index_t
Z
=
3
;
// filter D
constexpr
ck
::
index_t
Y
=
3
;
// filter H
constexpr
ck
::
index_t
X
=
3
;
// filter W
constexpr
ck
::
index_t
Di
=
9
;
// input D
constexpr
ck
::
index_t
Hi
=
9
;
// input H
constexpr
ck
::
index_t
Wi
=
7
;
// input W
constexpr
ck
::
index_t
Do
=
7
;
// output D
constexpr
ck
::
index_t
Ho
=
7
;
// output H
constexpr
ck
::
index_t
Wo
=
5
;
// output W
PerformImageToColumnPad0
(
G
,
N
,
Di
,
Hi
,
Wi
,
Do
,
Ho
,
Wo
,
C
,
Z
,
Y
,
X
,
{
1
,
1
,
1
}
/*filter_strides*/
,
{
1
,
1
,
1
}
/*filter_dilations*/
);
return
0
;
}
cmake/ClangTidy.cmake
View file @
cbcc844e
...
...
@@ -149,7 +149,7 @@ function(clang_tidy_check TARGET)
add_custom_target
(
${
tidy_target
}
# for some targets clang-tidy not able to get information from .clang-tidy
DEPENDS
${
SOURCE
}
COMMAND
${
CLANG_TIDY_COMMAND
}
"-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__
\;
__HIP_ROCclr__\}\]\}"
${
SOURCE
}
"-export-fixes=
${
CLANG_TIDY_FIXIT_DIR
}
/
${
TARGET
}
-
${
tidy_file
}
.yaml"
COMMAND
${
CLANG_TIDY_COMMAND
}
"-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__
\;
__HIP_PLATFORM_AMD__
\;
__HIP_ROCclr__\}\]\}"
${
SOURCE
}
"-export-fixes=
${
CLANG_TIDY_FIXIT_DIR
}
/
${
TARGET
}
-
${
tidy_file
}
.yaml"
WORKING_DIRECTORY
${
CMAKE_CURRENT_SOURCE_DIR
}
COMMENT
"clang-tidy: Running clang-tidy on target
${
SOURCE
}
..."
)
...
...
cmake/getopt.cmake
0 → 100644
View file @
cbcc844e
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
# Interface target that consumers link against to get getopt.
# It stays empty unless WIN32 is set.
add_library(getopt::getopt INTERFACE IMPORTED GLOBAL)

if(WIN32)
    include(FetchContent)

    # NOTE(review): GIT_TAG follows the `main` branch, so the fetched revision
    # can change between configures - consider pinning a commit.
    FetchContent_Declare(
        getopt
        GIT_REPOSITORY https://github.com/apwojcik/getopt.git
        GIT_TAG main
        SYSTEM
    )

    # Remember BUILD_SHARED_LIBS and force it OFF while getopt is added.
    set(__build_shared_libs ${BUILD_SHARED_LIBS})
    set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "")

    FetchContent_MakeAvailable(getopt)

    # Restore the old value of BUILD_SHARED_LIBS
    set(BUILD_SHARED_LIBS ${__build_shared_libs} CACHE BOOL "Type of libraries to build" FORCE)

    FetchContent_GetProperties(getopt)

    # Forward the fetched wingetopt target and its headers through the interface target.
    target_link_libraries(getopt::getopt INTERFACE wingetopt)
    target_include_directories(getopt::getopt INTERFACE ${getopt_SOURCE_DIR}/src)
endif()
\ No newline at end of file
Prev
1
2
3
4
5
6
…
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment