gaoqiong / composable_kernel_ROCM - Commits

Commit ef326c73, authored Nov 19, 2024 by Alan Turner

    Merge remote-tracking branch 'origin/develop' into migraphx-update

Parents: b7775add, e4dfe4d8
Changes: 511 files in the commit; showing 20 changed files with 2140 additions and 0 deletions (+2140 / -0).
Changed files shown in this view:

client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp  +50 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scale/grouped_conv_fwd_scale_fp16.cpp  +220 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab.inc  +221 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp  +13 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp  +13 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp  +13 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp  +13 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc  +216 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp  +18 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp  +18 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp  +18 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp  +18 -0
client_example/25_wrapper/CMakeLists.txt  +10 -0
client_example/25_wrapper/README.md  +177 -0
client_example/25_wrapper/tensor_transform_using_wrapper.cpp  +114 -0
client_example/25_wrapper/wrapper_basic_gemm.cpp  +228 -0
client_example/25_wrapper/wrapper_img2col.cpp  +183 -0
client_example/25_wrapper/wrapper_optimized_gemm.cpp  +319 -0
client_example/30_gemm_bf16Aint8B/CMakeLists.txt  +16 -0
client_example/30_gemm_bf16Aint8B/gemm_bias_fastgelu_xdl_bf16_i8.cpp  +262 -0
client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "common.hpp"

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

using InDataType       = ck::f8_t;
using WeiDataType      = ck::f8_t;
using CShuffleDataType = float;
using OutDataType      = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;

using InLayout  = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;

static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G             = 1;
static constexpr ck::index_t N             = 64;
static constexpr ck::index_t K             = 128;
static constexpr ck::index_t C             = 64;
static constexpr ck::index_t Z             = 3;
static constexpr ck::index_t Y             = 3;
static constexpr ck::index_t X             = 3;
static constexpr ck::index_t Di            = 28;
static constexpr ck::index_t Hi            = 28;
static constexpr ck::index_t Wi            = 3;
static constexpr ck::index_t Do            = 28;
static constexpr ck::index_t Ho            = 28;
static constexpr ck::index_t Wo            = 3;

int main()
{
    return run_grouped_conv_fwd_convscale_relu<NumDimSpatial,
                                               InDataType,
                                               WeiDataType,
                                               OutDataType,
                                               InLayout,
                                               WeiLayout,
                                               OutLayout,
                                               3,
                                               AComputeDataType,
                                               BComputeDataType>(
               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
}
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scale/grouped_conv_fwd_scale_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t>;

using InLayout  = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale       = ck::tensor_operation::element_wise::Scale;

static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G  = 32;
static constexpr ck::index_t N  = 64; // batch size
static constexpr ck::index_t K  = 64; // output channel
static constexpr ck::index_t C  = 32; // input channel (per group)
static constexpr ck::index_t Z  = 3;  // filter D
static constexpr ck::index_t Y  = 3;  // filter H
static constexpr ck::index_t X  = 3;  // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int execute_conv_fwd_scale()
{
    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
    // However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
    // Hence, we need to adjust the order of strides.
    std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
    std::array<ck::index_t, 6> in_strides{C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
    std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
    std::array<ck::index_t, 6> wei_strides{K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
    std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
    std::array<ck::index_t, 6> out_strides{K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
    // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
    std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
    std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};

    SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
        NumDimSpatial, InLayout, WeiLayout, ck::Tuple<>, OutLayout,
        InDataType, WeiDataType, ck::Tuple<>, OutDataType,
        PassThrough, PassThrough, Scale>;

    // get device op instances
    const auto op_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    int best_op_id        = -1;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr      = op_ptrs[i];
        auto argument_ptr = op_ptr->MakeArgumentPointer(
            in.GetDeviceBuffer(), wei.GetDeviceBuffer(), {}, out.GetDeviceBuffer(),
            in_lengths, in_strides, wei_lengths, wei_strides, {}, {},
            out_lengths, out_strides, filter_strides, filter_dilations,
            input_left_pads, input_right_pads,
            PassThrough{}, PassThrough{}, Scale{2.f});

        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop =
                std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 3 * N * Ho * Wo * G * K;
            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
                                    sizeof(WeiDataType) * G * K * Y * X * C +
                                    sizeof(OutDataType) * 2 * N * Ho * Wo * G * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;

            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                best_op_id      = i;
                best_op_name    = op_name;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
                best_tflops     = tflops;
            }
        }
        else
        {
            std::cerr << op_name << " does not support this problem" << std::endl;
        }
    }

    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(
            in.GetDeviceBuffer(), wei.GetDeviceBuffer(), {}, out.GetDeviceBuffer(),
            in_lengths, in_strides, wei_lengths, wei_strides, {}, {},
            out_lengths, out_strides, filter_strides, filter_dilations,
            input_left_pads, input_right_pads,
            PassThrough{}, PassThrough{}, Scale{2.f});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }
    return 0;
}

int main() { return execute_conv_fwd_scale(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab.inc (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using InLayout  = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAdd    = ck::tensor_operation::element_wise::ScaleAdd;

static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G  = 32;
static constexpr ck::index_t N  = 64; // batch size
static constexpr ck::index_t K  = 64; // output channel
static constexpr ck::index_t C  = 32; // input channel (per group)
static constexpr ck::index_t Z  = 3;  // filter D
static constexpr ck::index_t Y  = 3;  // filter H
static constexpr ck::index_t X  = 3;  // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int execute_conv_fwd_scaleadd_ab()
{
    constexpr ck::index_t NumAs = 2;
    constexpr ck::index_t NumBs = 2;
    constexpr float scale       = 1.5f;

    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
    // However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
    // Hence, we need to adjust the order of strides.
    std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
    std::array<ck::index_t, 6> in_strides{C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
    std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
    std::array<ck::index_t, 6> wei_strides{K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
    std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
    std::array<ck::index_t, 6> out_strides{K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};

    using InputDtype      = ck::tuple_element_t<0, InDataType>;
    using InputBiasDtype  = ck::tuple_element_t<1, InDataType>;
    using WeightDtype     = ck::tuple_element_t<0, WeiDataType>;
    using WeightBiasDtype = ck::tuple_element_t<1, WeiDataType>;

    SimpleDeviceMem in(sizeof(InputDtype) * N * Di * Hi * Wi * G * C);
    SimpleDeviceMem in_bias(sizeof(InputBiasDtype) * N * Di * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeightDtype) * G * K * Z * Y * X * C);
    SimpleDeviceMem wei_bias(sizeof(WeightBiasDtype) * G * K * Z * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
        NumDimSpatial, InLayout, WeiLayout, ck::Tuple<>, OutLayout,
        InDataType, WeiDataType, ck::Tuple<>, OutDataType,
        ScaleAdd, ScaleAdd, PassThrough>;

    // get device op instances
    const auto op_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    int best_op_id        = -1;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    std::array<const void*, NumAs> as = {in.GetDeviceBuffer(), in_bias.GetDeviceBuffer()};
    std::array<const void*, NumBs> bs = {wei.GetDeviceBuffer(), wei_bias.GetDeviceBuffer()};
    std::array<const void*, 0> ds{};

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr      = op_ptrs[i];
        auto argument_ptr = op_ptr->MakeArgumentPointer(
            as, bs, ds, out.GetDeviceBuffer(),
            in_lengths, in_strides, wei_lengths, wei_strides, {}, {},
            out_lengths, out_strides, filter_strides, filter_dilations,
            input_left_pads, input_right_pads,
            ScaleAdd{scale}, ScaleAdd{scale}, PassThrough{});

        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Z * Y * X +
                               N * Di * Hi * Wi * G * C + G * K * Z * Y * X * C;
            std::size_t num_bytes = 2 * sizeof(InDataType) * N * Di * Hi * Wi * G * C +
                                    2 * sizeof(WeiDataType) * G * K * Z * Y * X * C +
                                    sizeof(OutDataType) * N * Do * Ho * Wo * G * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;

            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                best_op_id      = i;
                best_op_name    = op_name;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
                best_tflops     = tflops;
            }
        }
        else
        {
            std::cerr << op_name << " does not support this problem" << std::endl;
        }
    }

    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(
            as, bs, ds, out.GetDeviceBuffer(),
            in_lengths, in_strides, wei_lengths, wei_strides, {}, {},
            out_lengths, out_strides, filter_strides, filter_dilations,
            input_left_pads, input_right_pads,
            ScaleAdd{scale}, ScaleAdd{scale}, PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }
    return 0;
}
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
using WeiDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
using OutDataType = ck::bhalf_t;

#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::Tuple<ck::half_t, ck::half_t>;
using WeiDataType = ck::Tuple<ck::half_t, ck::half_t>;
using OutDataType = ck::half_t;

#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::Tuple<float, float>;
using WeiDataType = ck::Tuple<float, float>;
using OutDataType = float;

#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::Tuple<int8_t, int8_t>;
using WeiDataType = ck::Tuple<int8_t, int8_t>;
using OutDataType = int8_t;

#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu.inc (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using InLayout   = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout  = ck::tensor_layout::convolution::GKZYXC;
using OutLayout  = ck::tensor_layout::convolution::NDHWGK;
using BiasLayout = ck::tensor_layout::convolution::G_K;

using PassThrough          = ck::tensor_operation::element_wise::PassThrough;
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;

static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G  = 32;
static constexpr ck::index_t N  = 64; // batch size
static constexpr ck::index_t K  = 64; // output channel
static constexpr ck::index_t C  = 32; // input channel (per group)
static constexpr ck::index_t Z  = 3;  // filter D
static constexpr ck::index_t Y  = 3;  // filter H
static constexpr ck::index_t X  = 3;  // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int execute_conv_fwd_scaleadd_scaleadd_relu()
{
    // We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
    // However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
    // Hence, we need to adjust the order of strides.
    std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
    std::array<ck::index_t, 6> in_strides{C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
    std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
    std::array<ck::index_t, 6> wei_strides{K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
    std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
    std::array<ck::index_t, 6> out_strides{K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
    // Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
    std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
    std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};

    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};

    SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
    SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
    SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
        NumDimSpatial, InLayout, WeiLayout, ck::Tuple<OutLayout, BiasLayout>, OutLayout,
        InDataType, WeiDataType,
        ck::Tuple<std::tuple_element_t<0, DDataTypes>, std::tuple_element_t<1, DDataTypes>>,
        OutDataType, PassThrough, PassThrough, ScaleAddScaleAddRelu>;

    // get device op instances
    const auto op_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    int best_op_id        = -1;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr      = op_ptrs[i];
        auto argument_ptr = op_ptr->MakeArgumentPointer(
            in.GetDeviceBuffer(), wei.GetDeviceBuffer(),
            {d0.GetDeviceBuffer(), d1.GetDeviceBuffer()}, out.GetDeviceBuffer(),
            in_lengths, in_strides, wei_lengths, wei_strides,
            {out_lengths, bias_lengths}, {out_strides, bias_strides},
            out_lengths, out_strides, filter_strides, filter_dilations,
            input_left_pads, input_right_pads,
            PassThrough{}, PassThrough{}, ScaleAddScaleAddRelu{2.f, 2.f});

        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop =
                std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 2 * N * Ho * Wo * G * K;
            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
                                    sizeof(WeiDataType) * G * K * Y * X * C +
                                    (sizeof(OutDataType) + sizeof(std::tuple_element_t<0, DDataTypes>) +
                                     sizeof(std::tuple_element_t<1, DDataTypes>)) *
                                        N * Ho * Wo * G * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;

            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                best_op_id      = i;
                best_op_name    = op_name;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
                best_tflops     = tflops;
            }
        }
        else
        {
            std::cerr << op_name << " does not support this problem" << std::endl;
        }
    }

    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(
            in.GetDeviceBuffer(), wei.GetDeviceBuffer(),
            {d0.GetDeviceBuffer(), d1.GetDeviceBuffer()}, out.GetDeviceBuffer(),
            in_lengths, in_strides, wei_lengths, wei_strides,
            {out_lengths, bias_lengths}, {out_strides, bias_strides},
            out_lengths, out_strides, filter_strides, filter_dilations,
            input_left_pads, input_right_pads,
            PassThrough{}, PassThrough{}, ScaleAddScaleAddRelu{2.f, 2.f});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }
    return 0;
}
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::bhalf_t, ck::bhalf_t>;

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t, ck::half_t>;

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = float;
using WeiDataType = float;
using OutDataType = float;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<float, float>;

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <tuple>

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

using InDataType  = int8_t;
using WeiDataType = int8_t;
using OutDataType = int8_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<float, float>;

#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"

int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
client_example/25_wrapper/CMakeLists.txt (new file, mode 100644)

add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
add_executable(client_wrapper_img2col wrapper_img2col.cpp)
target_link_libraries(client_wrapper_img2col PRIVATE composable_kernel::device_other_operations)
if(GPU_TARGETS MATCHES "gfx9")
    add_executable(client_wrapper_basic_gemm wrapper_basic_gemm.cpp)
    target_link_libraries(client_wrapper_basic_gemm PRIVATE composable_kernel::device_other_operations)
    add_executable(client_wrapper_optimized_gemm wrapper_optimized_gemm.cpp)
    target_link_libraries(client_wrapper_optimized_gemm PRIVATE composable_kernel::device_other_operations)
endif()
client_example/25_wrapper/README.md (new file, mode 100644)

# Composable Kernel wrapper GEMM tutorial
This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
wrapper. We present the base version of GEMM without most of the available optimizations; however,
it's worth noting that CK has kernels with different optimizations.
To implement these optimizations, you can use the CK wrapper or directly use available instances in
CK. You can also refer to the
[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
which uses the CK wrapper based on the
[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp)
implementation.
The kernel definition should look similar to:
```cpp
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
                                                        const void* p_b,
                                                        void* p_c,
                                                        const ck::index_t M,
                                                        const ck::index_t N,
                                                        const ck::index_t K,
                                                        const BlockShape tile_shape,
                                                        const ThreadLayout thread_layout)
```
We pass pointers to global memory and matrix dimensions via arguments. Additionally, we pass
the selected lengths of data processed by each block (`tile_shape`) and the thread layout
(`thread_layout`). As compile-time parameters, we define the data type, the
[traits for the GEMM operation](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp),
and the scalar-per-vector value used during copy.
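For a concrete set of these parameters, see how `wrapper_basic_gemm.cpp` (included later in this commit)
instantiates the kernel; the snippet below is adapted from its `main` function and is just one valid configuration:

```cpp
using DataType = ck::half_t;
// Traits type used by the example (per its name: 32x32 XDL tiles, 4x2 XDL per wave, K1 = 8).
using GemmTraits = ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1;
constexpr ck::index_t scalar_per_vector = 8;
// 256 threads per block arranged as 64x4, and a 256x128x32 (M, N, K) tile per block.
const auto thread_layout = ck::wrapper::make_layout(
    ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}),
    ck::make_tuple(ck::Number<4>{}, ck::Number<1>{}));
const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});
```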
Step 1: Create layouts for global and LDS memory.
```cpp
// Specify layouts for global memory.
const auto a_global_layout =
    ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
const auto b_global_layout =
    ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
const auto c_global_layout =
    ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
// Specify layouts for tiles.
constexpr auto a_tile_layout = ck::wrapper::make_layout(
    ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto b_tile_layout = ck::wrapper::make_layout(
    ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto c_tile_layout = ck::wrapper::make_layout(
    ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
// Apply padding for global memory.
auto a_global_layout_padded = ck::wrapper::pad(a_global_layout, shape(a_tile_layout));
auto b_global_layout_padded = ck::wrapper::pad(b_global_layout, shape(b_tile_layout));
auto c_global_layout_padded = ck::wrapper::pad(c_global_layout, shape(c_tile_layout));
```
We pad layouts for global tensors in case M, N, and K are not divisible by `MPerBlock`,
`NPerBlock`, or `KPerBlock`.
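As a rough illustration of what the padding buys (a sketch added here, not part of the original example),
a padded dimension conceptually covers the tile-size multiple that the grid is later sized against;
`ck::math::integer_divide_ceil` is the same helper the host code uses to compute the grid:

```cpp
// Illustration only; the values of M and MPerBlock below are hypothetical.
// With M = 1000 and MPerBlock = 256, integer_divide_ceil(1000, 256) * 256 = 4 * 256 = 1024,
// so the last row-tile extends past the real matrix and relies on the padded layout.
const ck::index_t padded_M = ck::math::integer_divide_ceil(M, MPerBlock) * MPerBlock;
```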
Step 2: Create tensors for global and LDS memory.
```cpp
// Make tensors for global memory.
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
    static_cast<const DataType*>(p_a), a_global_layout_padded);
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
    static_cast<const DataType*>(p_b), b_global_layout_padded);
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
    static_cast<DataType*>(p_c), c_global_layout_padded);
// Allocate LDS memory.
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
// Make tensors for lds memory.
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
    static_cast<DataType*>(lds_a), a_tile_layout);
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
    static_cast<DataType*>(lds_b), b_tile_layout);
```
We must specify parameters for the copy and convert the block indexes to a tuple:
```cpp
// Specify block index as tuple.
const auto block_idxs = ck::make_tuple(static_cast<ck::index_t>(blockIdx.x),
                                       static_cast<ck::index_t>(blockIdx.y),
                                       ck::wrapper::slice());
// Specify access parameters for copy.
using DimAccessOrder             = ck::Tuple<ck::Number<0>, ck::Number<1>>;
constexpr ck::index_t vector_dim = 1;
```
We create a local tile (per block) and local partitions (per thread) for the global memory (`C`).
We also define and clear an output register (`c_vgpr_reg`) for the accumulation.
```cpp
auto c_global_local_tile = ck::wrapper::make_local_tile(
    c_global_tensor,
    tile_shape,
    block_idxs,
    make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
auto c_global_local_partition =
    ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
                                                           decltype(a_tile_layout),
                                                           decltype(b_tile_layout),
                                                           ck::wrapper::size(thread_layout),
                                                           GemmTraits>(c_global_local_tile);
// Create C vgpr to accumulate results.
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
                                                              decltype(a_tile_layout),
                                                              decltype(b_tile_layout),
                                                              ck::wrapper::size(thread_layout),
                                                              GemmTraits>();
// Clear C vgpr.
ck::wrapper::clear(c_vgpr_reg);
```
We use two functions specific to `blockwise_gemm`: `make_blockwise_gemm_xdl_c_local_partition` and
`make_blockwise_gemm_xdl_c_vgpr`. These help choose the appropriate partition for the `C` output
and define tensors with the specific layouts `blockwise_gemm` expects. In the following step, we use
only generic functions from the CK wrapper.
Step 3: Create the compute loop.
```cpp
const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
ck::index_t i              = 0;
do
{
    // Get KPerBlock slice.
    const auto k_slice           = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
    auto a_global_tensor_k_slice = a_global_tensor(ck::wrapper::slice(), k_slice);
    auto b_global_tensor_k_slice = b_global_tensor(ck::wrapper::slice(), k_slice);
    // Create local tiles for A and B.
    auto a_global_local_tile = ck::wrapper::make_local_tile(
        a_global_tensor_k_slice,
        tile_shape,
        block_idxs,
        make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
    auto b_global_local_tile = ck::wrapper::make_local_tile(
        b_global_tensor_k_slice,
        tile_shape,
        block_idxs,
        make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
    // Copy from global to LDS.
    ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
        a_global_local_tile, a_lds_tensor, thread_layout);
    ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
        b_global_local_tile, b_lds_tensor, thread_layout);
    // Synchronize lds.
    ck::block_sync_lds();
    // Execute blockwise GEMM.
    ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
        a_lds_tensor, b_lds_tensor, c_vgpr_reg);

    ++i;
} while(i < num_loop);
```
The loop iterates over `K / KPerBlock`. In each iteration a local tile is created for the A and B
tensors (one tile per block) and data is copied from global memory to LDS. The `blockwise_gemm`
function performs the GEMM operation on `a_lds_tensor` and `b_lds_tensor`, and stores the results
in `c_vgpr_reg`.

The end result from `c_vgpr_reg` is stored in the `C` local partition (tensor per thread):
```cpp
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
```
If you want to dive deep into the details, you can find the entire example
[here](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_basic_gemm.cpp).
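The tutorial ends at the kernel body; the host-side grid sizing and launch live in
`wrapper_basic_gemm.cpp` from this same commit. A condensed sketch of that path (same names as in
the file) looks like this:

```cpp
// One thread block per (MPerBlock, NPerBlock) tile of C.
const ck::index_t grid_size_x = ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
const ck::index_t grid_size_y = ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
const auto kernel = DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout>;
// launch_and_time_kernel (from ck/host_utility/kernel_launch.hpp) launches the kernel
// and, per the example's perf printout, returns the averaged time in ms.
const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                              kernel,
                                              dim3(grid_size_x, grid_size_y, 1),
                                              dim3(ck::wrapper::size(thread_layout)),
                                              0,
                                              a_mem.GetDeviceBuffer(),
                                              b_mem.GetDeviceBuffer(),
                                              c_mem.GetDeviceBuffer(),
                                              M, N, K, tile_shape, thread_layout);
```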
client_example/25_wrapper/tensor_transform_using_wrapper.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/wrapper/layout.hpp"

using DataType = int;

template <typename Layout>
void Print1d(const Layout& layout)
{
    std::cout << "Print1d" << std::endl;
    for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++)
    {
        std::cout << layout(ck::make_tuple(w)) << " ";
    }
    std::cout << std::endl;
}

template <typename Layout>
void Print2d(const Layout& layout)
{
    std::cout << "Print2d" << std::endl;
    for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
    {
        for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
        {
            std::cout << layout(ck::make_tuple(h, w)) << " ";
        }
        std::cout << std::endl;
    }
}

// Print in (x,y),z pattern
template <typename Layout>
void Print3dCustom(const Layout& layout)
{
    std::cout << "Print3dCustom" << std::endl;
    for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
    {
        for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
        {
            for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
            {
                std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}

int main()
{
    // Layout traverse in row-major
    std::cout << "Note: Layout traverse in column-major" << std::endl;

    // Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor)
    // (dims:4,8 strides:1,4)
    const auto shape_4x8       = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
    const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8);
    std::cout << "dims:4,8 strides:1,4" << std::endl;
    Print2d(layout_4x8_s1x4);

    using Cord1x1Type               = ck::Tuple<ck::Number<1>, ck::Number<1>>;
    constexpr ck::index_t offset_1x1 = layout_4x8_s1x4.template operator()<Cord1x1Type>();
    std::cout << "Constexpr calculated [1, 1] offset:" << offset_1x1 << std::endl;

    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor)
    // dims:4,(2,4) strides:2,(1,8)
    const auto shape_4x2x4         = ck::make_tuple(4, ck::make_tuple(2, 4));
    const auto strides_s2x1x8      = ck::make_tuple(2, ck::make_tuple(1, 8));
    const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
    std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
    Print2d(layout_4x2x4_s2x1x8);

    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
    // dims:(2,2),(2,4) strides:(1,4),(2,8)
    const auto shape_2x2x2x4 =
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}),
                       ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}));
    const auto strides_s1x4x2x8 =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}),
                       ck::make_tuple(ck::Number<2>{}, ck::Number<8>{}));
    static const auto layout_2x2x2x4_s1x4x2x8 =
        ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
    std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
    Print2d(layout_2x2x2x4_s1x4x2x8);
    Print3dCustom(layout_2x2x2x4_s1x4x2x8);

    // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
    // dims:((2,2),2),4 strides:((1,4),2),8
    // Transform to 2d
    const auto shape_2x2x2x4_nested = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<2>{}),
        ck::Number<4>{});
    const auto strides_s1x4x2x8_nested = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}),
        ck::Number<8>{});
    static const auto layout_2x2x2x4_s1x4x2x8_nested =
        ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
    std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
    Print1d(layout_2x2x2x4_s1x4x2x8_nested);
    Print2d(layout_2x2x2x4_s1x4x2x8_nested);
    Print3dCustom(layout_2x2x2x4_s1x4x2x8_nested);

    return 0;
}
client_example/25_wrapper/wrapper_basic_gemm.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>

#include "ck/utility/common_header.hpp"

// __gfx9__ defined in the above header via ck.hpp
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))

#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
#include "ck/host_utility/device_prop.hpp"

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
                                                        const void* p_b,
                                                        void* p_c,
                                                        const ck::index_t M,
                                                        const ck::index_t N,
                                                        const ck::index_t K,
                                                        const BlockShape tile_shape,
                                                        const ThreadLayout thread_layout)
{
    constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
    constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
    constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);

    // Specify layouts for global memory.
    const auto a_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
    const auto b_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
    const auto c_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
    // Specify layouts for tiles.
    constexpr auto a_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
    constexpr auto b_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
    constexpr auto c_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
    // Apply padding for global memory.
    auto a_global_layout_padded = ck::wrapper::pad(a_global_layout, shape(a_tile_layout));
    auto b_global_layout_padded = ck::wrapper::pad(b_global_layout, shape(b_tile_layout));
    auto c_global_layout_padded = ck::wrapper::pad(c_global_layout, shape(c_tile_layout));
    // Make tensors for global memory.
    auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_a), a_global_layout_padded);
    auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_b), b_global_layout_padded);
    auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<DataType*>(p_c), c_global_layout_padded);
    // Allocate lds memory.
    __shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
    __shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
    // Make tensors for lds memory.
    auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_a), a_tile_layout);
    auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_b), b_tile_layout);
    // Specify block index as tuple.
    const auto block_idxs = ck::make_tuple(static_cast<ck::index_t>(blockIdx.x),
                                           static_cast<ck::index_t>(blockIdx.y),
                                           ck::wrapper::slice());
    // Specify access parameters for copy.
    using DimAccessOrder             = ck::Tuple<ck::Number<0>, ck::Number<1>>;
    constexpr ck::index_t vector_dim = 1;

    // Create tile and partition for C. Use specific function for blockwise_gemm to assign the
    // appropriate partitions.
    auto c_global_local_tile = ck::wrapper::make_local_tile(
        c_global_tensor,
        tile_shape,
        block_idxs,
        make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
    auto c_global_local_partition =
        ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
                                                               decltype(a_tile_layout),
                                                               decltype(b_tile_layout),
                                                               ck::wrapper::size(thread_layout),
                                                               GemmTraits>(c_global_local_tile);
    // Create C vgpr to accumulate results.
    auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
                                                                  decltype(a_tile_layout),
                                                                  decltype(b_tile_layout),
                                                                  ck::wrapper::size(thread_layout),
                                                                  GemmTraits>();
    // Clear C vgpr.
    ck::wrapper::clear(c_vgpr_reg);

    // Iterate over K with KPerBlock step.
    const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
    ck::index_t i              = 0;
    do
    {
        // Get KPerBlock slice.
        const auto k_slice           = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
        auto a_global_tensor_k_slice = a_global_tensor(ck::wrapper::slice(), k_slice);
        auto b_global_tensor_k_slice = b_global_tensor(ck::wrapper::slice(), k_slice);
        // Create local tiles for A and B.
        auto a_global_local_tile = ck::wrapper::make_local_tile(
            a_global_tensor_k_slice,
            tile_shape,
            block_idxs,
            make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
        auto b_global_local_tile = ck::wrapper::make_local_tile(
            b_global_tensor_k_slice,
            tile_shape,
            block_idxs,
            make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
        // Copy from global to lds.
        ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
            a_global_local_tile, a_lds_tensor, thread_layout);
        ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
            b_global_local_tile, b_lds_tensor, thread_layout);
        // Synchronize lds.
        ck::block_sync_lds();
        // Execute blockwise gemm.
        ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
            a_lds_tensor, b_lds_tensor, c_vgpr_reg);

        ++i;
    } while(i < num_loop);
    // Copy vgpr results to C global memory.
    ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}

template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout>
void PerformGemm(const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K,
                 const BlockShape& tile_shape,
                 const ThreadLayout& thread_layout)
{
    // Global memory buffers
    SimpleDeviceMem a_mem(M * K * sizeof(DataType));
    SimpleDeviceMem b_mem(K * N * sizeof(DataType));
    SimpleDeviceMem c_mem(M * N * sizeof(DataType));

    const ck::index_t grid_size_x =
        ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
    const ck::index_t grid_size_y =
        ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));

    const auto kernel =
        DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout>;
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size_x, grid_size_y, 1),
                                                  dim3(ck::wrapper::size(thread_layout)),
                                                  0,
                                                  a_mem.GetDeviceBuffer(),
                                                  b_mem.GetDeviceBuffer(),
                                                  c_mem.GetDeviceBuffer(),
                                                  M,
                                                  N,
                                                  K,
                                                  tile_shape,
                                                  thread_layout);

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;

    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s, " << std::endl;
}

int main(int argc, char* argv[])
{
    bool is_supported = ck::is_xdl_supported();
    if(!is_supported)
    {
        std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
                  << std::endl;
        return 0;
    }

    using DataType = ck::half_t;

    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}),
                                 ck::make_tuple(ck::Number<4>{}, ck::Number<1>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});

    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1, 8>(
        3840, 4096, 4096, tile_shape, thread_layout);
    return 0;
}
#endif
client_example/25_wrapper/wrapper_img2col.cpp
0 → 100644
View file @
ef326c73
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <initializer_list>
#include <vector>
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
static constexpr ck::index_t NumDimSpatial = 3;

using DataType    = float;
using InputLayout = ck::tensor_layout::convolution::NDHWGC;

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

template <typename InputTensor, typename OutputTensor, typename BlockShape, typename ThreadLayout>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceImageToColumnPad0(
    InputTensor input_tensor,
    OutputTensor output_tensor,
    const BlockShape tile_shape,
    const ThreadLayout thread_layout)
{
    // grid layout (dim1, dim0)
    const auto block_idxs = ck::make_tuple(static_cast<ck::index_t>(blockIdx.y),
                                           static_cast<ck::index_t>(blockIdx.x));
    // Get local tiles for global memory
    auto input_local_tile  = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs);
    auto output_local_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs);
    // Get partition per thread
    const auto input_local_partition =
        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
    auto output_local_partition =
        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
    // Perform copy
    using DimAccessOrder                    = ck::Tuple<ck::Number<0>, ck::Number<1>>;
    constexpr ck::index_t vector_dim        = 1;
    constexpr ck::index_t scalar_per_vector = 4;
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
                                                                     output_local_partition);
}
void PerformImageToColumnPad0(const ck::index_t G,
                              const ck::index_t N,
                              const ck::index_t Di,
                              const ck::index_t Hi,
                              const ck::index_t Wi,
                              const ck::index_t Do,
                              const ck::index_t Ho,
                              const ck::index_t Wo,
                              const ck::index_t C,
                              const ck::index_t Z,
                              const ck::index_t Y,
                              const ck::index_t X,
                              std::array<ck::index_t, NumDimSpatial> filter_strides,
                              std::array<ck::index_t, NumDimSpatial> filter_dilations)
{
    const ck::index_t ZYXC = Z * Y * X * C;
    const ck::index_t GC   = G * C;
    // shape: ((G, (Wo, Ho, Do, N)), (C, X, Y, Z))
    const auto shape = ck::make_tuple(ck::make_tuple(G, ck::make_tuple(Wo, Ho, Do, N)),
                                      ck::make_tuple(C, X, Y, Z));
    const auto in_strides =
        ck::make_tuple(ck::make_tuple(C,
                                      ck::make_tuple(filter_strides[2] * GC,
                                                     filter_strides[1] * Wi * GC,
                                                     filter_strides[0] * Hi * Wi * GC,
                                                     Di * Hi * Wi * GC)),
                       ck::make_tuple(1,
                                      filter_dilations[2] * GC,
                                      filter_dilations[1] * Wi * GC,
                                      filter_dilations[0] * Hi * Wi * GC));
    const auto in_layout = ck::wrapper::make_layout(shape, in_strides);
    const auto out_strides =
        ck::make_tuple(ck::make_tuple(ZYXC,
                                      ck::make_tuple(ZYXC * G,
                                                     Wo * ZYXC * G,
                                                     Ho * Wo * ZYXC * G,
                                                     Do * Ho * Wo * ZYXC * G)),
                       ck::make_tuple(1, C, X * C, Y * X * C));
    const auto out_layout = ck::wrapper::make_layout(shape, out_strides);

    const ck::index_t input_size = N * Di * Hi * Wi * GC;
    // Global memory buffers
    SimpleDeviceMem in_buf(input_size * sizeof(DataType));
    SimpleDeviceMem out_buf(ck::wrapper::size(out_layout) * sizeof(DataType));

    // User can choose appropriate number of threads and sizes per block
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}, ck::Number<16>{}),
                                 ck::make_tuple(ck::Number<16>{}, ck::Number<1>{}));
    // This example doesn't support padding, user should select tile sizes
    // which are divisible by the shape.
    const auto tile_shape = ck::make_tuple(ck::Number<32>{}, ck::Number<64>{});

    // Create buffers for global memory
    auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(in_buf.GetDeviceBuffer()), in_layout);
    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<DataType*>(out_buf.GetDeviceBuffer()), out_layout);

    // grid layout (dim1, dim0)
    const ck::index_t grid_size_x = ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout),
                                                                  ck::wrapper::size<1>(tile_shape));
    const ck::index_t grid_size_y = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout),
                                                                  ck::wrapper::size<0>(tile_shape));
    const auto kernel = DeviceImageToColumnPad0<decltype(input_tensor_global),
                                                decltype(output_tensor_global),
                                                decltype(tile_shape),
                                                decltype(thread_layout)>;
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size_x, grid_size_y, 1),
                                                  dim3(ck::wrapper::size(thread_layout)),
                                                  0,
                                                  input_tensor_global,
                                                  output_tensor_global,
                                                  tile_shape,
                                                  thread_layout);

    std::size_t num_btype = G * N * Do * Ho * Wo * ZYXC * 2 * sizeof(DataType);
    float gb_per_sec      = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
              << std::endl;
}
int main(int argc, char* argv[])
{
    constexpr ck::index_t G  = 4;  // number of groups
    constexpr ck::index_t N  = 32; // batch
    constexpr ck::index_t C  = 64; // input channel (per group)
    constexpr ck::index_t Z  = 3;  // filter D
    constexpr ck::index_t Y  = 3;  // filter H
    constexpr ck::index_t X  = 3;  // filter W
    constexpr ck::index_t Di = 9;  // input D
    constexpr ck::index_t Hi = 9;  // input H
    constexpr ck::index_t Wi = 7;  // input W
    constexpr ck::index_t Do = 7;  // output D
    constexpr ck::index_t Ho = 7;  // output H
    constexpr ck::index_t Wo = 5;  // output W

    PerformImageToColumnPad0(G, N, Di, Hi, Wi, Do, Ho, Wo, C, Z, Y, X,
                             {1, 1, 1} /*filter_strides*/,
                             {1, 1, 1} /*filter_dilations*/);
    return 0;
}
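For orientation (numbers derived from the defaults in main() above, not part of the file), the column buffer produced here is, per group, an (N * Do * Ho * Wo) x (Z * Y * X * C) matrix:

// rows per group = N * Do * Ho * Wo = 32 * 7 * 7 * 5 = 7840
// columns (ZYXC) = 3 * 3 * 3 * 64 = 1728
// total elements = G * 7840 * 1728 = 54190080 = ck::wrapper::size(out_layout)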
client_example/25_wrapper/wrapper_optimized_gemm.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include "ck/utility/common_header.hpp"
// __gfx9__ defined in the above header via ck.hpp
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
template <bool DoPad, typename Layout, typename PaddingDims>
__device__ auto ApplyPadding(const Layout& layout, const PaddingDims& padding_dims)
{
    if constexpr(DoPad)
    {
        return ck::wrapper::pad(layout, padding_dims);
    }
    else
    {
        return layout;
    }
}
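// The DeviceGemm kernel below works as follows: each block stages its A and B tiles
// from global memory through VGPRs into LDS, then iterates over the K dimension,
// issuing the next K-slice's global->VGPR copies while the xdlops blockwise GEMM
// consumes the tiles already resident in LDS, and finally processes the tail tile
// and writes the accumulated C VGPR results back to global memory.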
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout,
          bool DoPadding>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
                                                         const void* p_b,
                                                         void* p_c,
                                                         const ck::index_t M,
                                                         const ck::index_t N,
                                                         const ck::index_t K,
                                                         const BlockShape tile_shape,
                                                         const ThreadLayout thread_layout)
{
    constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
    constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
    constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);

    constexpr auto K1         = GemmTraits::K1;
    constexpr auto K0PerBlock = KPerBlock / K1;
    const auto K0             = ck::math::integer_divide_ceil(K, K1);

    const auto tile_shape_k0_m_n_k1 = ck::make_tuple(K0PerBlock, MPerBlock, NPerBlock, K1);

    // Create layouts for global memory
    const auto a_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
    const auto b_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
    const auto c_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
    // Apply padding
    auto a_padded_global_layout =
        ApplyPadding<DoPadding>(a_global_layout, ck::make_tuple(MPerBlock, KPerBlock));
    auto b_padded_global_layout =
        ApplyPadding<DoPadding>(b_global_layout, ck::make_tuple(NPerBlock, KPerBlock));
    auto c_padded_global_layout =
        ApplyPadding<DoPadding>(c_global_layout, ck::make_tuple(MPerBlock, NPerBlock));
    // Reshape from M,K to K0,M,K1
    const auto reshaped_dims_idxs =
        ck::make_tuple(ck::Number<1>{}, ck::make_tuple(ck::Number<0>{}, ck::Number<2>{}));
    auto a_padded_unmerged_global_layout =
        ck::wrapper::unmerge<1>(a_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
    auto b_padded_unmerged_global_layout =
        ck::wrapper::unmerge<1>(b_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
    // Create tensors for global memory
    auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_a), a_padded_unmerged_global_layout);
    auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_b), b_padded_unmerged_global_layout);
    auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<DataType*>(p_c), c_padded_global_layout);
    // Create layouts and tensors for lds memory.
    constexpr auto a_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(K0PerBlock, MPerBlock, K1),
        ck::make_tuple((MPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
    constexpr auto b_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(K0PerBlock, NPerBlock, K1),
        ck::make_tuple((NPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
    __shared__ DataType lds_a[ck::wrapper::size(a_tile_layout) + K0PerBlock];
    __shared__ DataType lds_b[ck::wrapper::size(b_tile_layout) + K0PerBlock];
    auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_a), a_tile_layout);
    auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_b), b_tile_layout);

    const auto block_idxs = ck::make_tuple(ck::wrapper::slice(),
                                           static_cast<ck::index_t>(blockIdx.x),
                                           static_cast<ck::index_t>(blockIdx.y),
                                           ck::wrapper::slice());
    using DimAccessOrder             = ck::Tuple<ck::Number<1>, ck::Number<0>, ck::Number<2>>;
    constexpr ck::index_t vector_dim = 2;
    // Create tile and partition for C global memory. Use specific gemm
    // functions to get appropriate layouts.
    auto c_global_local_tile = ck::wrapper::make_local_tile(
        c_global_tensor,
        tile_shape_k0_m_n_k1,
        block_idxs,
        make_tuple(
            ck::wrapper::slice(K0PerBlock), ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(K1)));
    auto c_global_local_partition =
        ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
                                                               decltype(a_tile_layout),
                                                               decltype(b_tile_layout),
                                                               ck::wrapper::size(thread_layout),
                                                               GemmTraits>(c_global_local_tile);
    // Define and clear c vgpr register
    auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
                                                                  decltype(a_tile_layout),
                                                                  decltype(b_tile_layout),
                                                                  ck::wrapper::size(thread_layout),
                                                                  GemmTraits>();
    ck::wrapper::clear(c_vgpr_reg);
    // Local partitions for lds memory
    auto a_lds_tensor_local_partition =
        ck::wrapper::make_local_partition(a_lds_tensor, thread_layout, threadIdx.x);
    auto b_lds_tensor_local_partition =
        ck::wrapper::make_local_partition(b_lds_tensor, thread_layout, threadIdx.x);
    // Lambda to slice tensor, then create local tile and partition
    auto make_global_partition = [&](auto tensor, auto projection, ck::index_t i) {
        const auto k_slice =
            ck::make_tuple(ck::wrapper::slice(i * K0PerBlock, (i + 1) * K0PerBlock),
                           ck::wrapper::slice(),
                           ck::wrapper::slice());
        auto local_tile = ck::wrapper::make_local_tile(
            tensor(k_slice), tile_shape_k0_m_n_k1, block_idxs, projection);
        return ck::wrapper::make_local_partition(local_tile, thread_layout, threadIdx.x);
    };
    auto a_global_local_partition = make_global_partition(
        a_global_tensor,
        make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
        0);
    auto b_global_local_partition = make_global_partition(
        b_global_tensor,
        make_tuple(ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
        0);
    // (row-major vgpr layout)
    auto a_vgpr_tensor =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
            ck::wrapper::make_layout(
                shape(a_global_local_partition),
                ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
                                   ck::wrapper::size<2>(a_global_local_partition),
                               ck::wrapper::size<2>(a_global_local_partition),
                               ck::Number<1>{})));
    auto b_vgpr_tensor =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
            ck::wrapper::make_layout(
                shape(b_global_local_partition),
                ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
                                   ck::wrapper::size<2>(a_global_local_partition),
                               ck::wrapper::size<2>(a_global_local_partition),
                               ck::Number<1>{})));
    // Copy first values to lds
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_global_local_partition,
                                                                     a_vgpr_tensor);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_global_local_partition,
                                                                     b_vgpr_tensor);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_vgpr_tensor,
                                                                     a_lds_tensor_local_partition);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_vgpr_tensor,
                                                                     b_lds_tensor_local_partition);
    // Pipeline loop
    const ck::index_t num_loop =
        __builtin_amdgcn_readfirstlane(ck::math::integer_divide_ceil(K, KPerBlock));
    // Skip if only one tile should be processed
    if(num_loop > 1)
    {
        ck::index_t i = 0;
        do
        {
            auto a_global_local_partition_i = make_global_partition(
                a_global_tensor,
                make_tuple(
                    ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
                i + 1);
            auto b_global_local_partition_i = make_global_partition(
                b_global_tensor,
                make_tuple(
                    ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
                i + 1);
            // Copy data to A vgpr.
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                a_global_local_partition_i, a_vgpr_tensor);
            // Synchronize.
            ck::block_sync_lds();
            // Copy data to B vgpr.
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                b_global_local_partition_i, b_vgpr_tensor);
            // Perform gemm.
            ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
                a_lds_tensor, b_lds_tensor, c_vgpr_reg);
            // Synchronize
            ck::block_sync_lds();
            // Copy data to A and B lds tiles.
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                a_vgpr_tensor, a_lds_tensor_local_partition);
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                b_vgpr_tensor, b_lds_tensor_local_partition);
            ++i;
        } while(i < (num_loop - 1));
    }
    // Handle tail.
    ck::block_sync_lds();
    ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
        a_lds_tensor, b_lds_tensor, c_vgpr_reg);
    // Store data from C vgpr to C global memory.
    ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          bool DoPadding,
          typename BlockShape,
          typename ThreadLayout>
void PerformGemm(const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K,
                 const BlockShape& tile_shape,
                 const ThreadLayout& thread_layout)
{
    // Global memory buffers
    SimpleDeviceMem a_mem(M * K * sizeof(DataType));
    SimpleDeviceMem b_mem(K * N * sizeof(DataType));
    SimpleDeviceMem c_mem(M * N * sizeof(DataType));

    const ck::index_t grid_size_x =
        ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
    const ck::index_t grid_size_y =
        ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));

    const auto kernel =
        DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout, DoPadding>;
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size_x, grid_size_y, 1),
                                                  dim3(ck::wrapper::size(thread_layout)),
                                                  0,
                                                  a_mem.GetDeviceBuffer(),
                                                  b_mem.GetDeviceBuffer(),
                                                  c_mem.GetDeviceBuffer(),
                                                  M,
                                                  N,
                                                  K,
                                                  tile_shape,
                                                  thread_layout);

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;

    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;

    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s, " << std::endl;
}
int main(int argc, char* argv[])
{
    bool is_supported = ck::is_xdl_supported();
    if(!is_supported)
    {
        std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
                  << std::endl;
        return 0;
    }

    using DataType = ck::half_t;

    const auto thread_layout = ck::wrapper::make_layout(
        ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
        ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});

    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1, 8, false>(
        3840, 4096, 4096, tile_shape, thread_layout);

    return 0;
}
#endif
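As a rough check (derived from the defaults in main() above, not part of the file), DoPadding is set to false, so the problem sizes are chosen to divide the tile exactly:

// M / MPerBlock = 3840 / 256 = 15 blocks in x
// N / NPerBlock = 4096 / 128 = 32 blocks in y
// K / KPerBlock = 4096 / 32  = 128 main-loop iterations (num_loop)
// threads per block = size(thread_layout) = 4 * 64 * 1 = 256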
client_example/30_gemm_bf16Aint8B/CMakeLists.txt
0 → 100644
if(GPU_TARGETS MATCHES "gfx9" AND
   ((DTYPES MATCHES "int8" AND DTYPES MATCHES "bf16") OR NOT DEFINED DTYPES))
    add_executable(client_gemm_bias_fastgelu_bf16_i8_bf16 gemm_bias_fastgelu_xdl_bf16_i8.cpp)
    target_link_libraries(client_gemm_bias_fastgelu_bf16_i8_bf16
                          PRIVATE composable_kernel::device_gemm_operations)

    add_executable(client_gemm_bias_bf16_i8_bf16 gemm_bias_xdl_bf16_i8.cpp)
    target_link_libraries(client_gemm_bias_bf16_i8_bf16
                          PRIVATE composable_kernel::device_gemm_operations)

    add_executable(client_gemm_gelu_bf16_i8_bf16 gemm_xdl_gelu_bf16_i8.cpp)
    target_link_libraries(client_gemm_gelu_bf16_i8_bf16
                          PRIVATE composable_kernel::device_gemm_operations)

    add_executable(client_gemm_bf16_i8_bf16 gemm_xdl_bf16_i8.cpp)
    target_link_libraries(client_gemm_bf16_i8_bf16
                          PRIVATE composable_kernel::device_gemm_operations)

    add_executable(client_gemm_multiply_bf16_i8_bf16 gemm_xdl_multiply_bf16_i8.cpp)
    target_link_libraries(client_gemm_multiply_bf16_i8_bf16
                          PRIVATE composable_kernel::device_gemm_operations)
endif()
client_example/30_gemm_bf16Aint8B/gemm_bias_fastgelu_xdl_bf16_i8.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iomanip>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_multi_abd.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using BF16 = ck::bhalf_t;
using I8   = int8_t;
using F32  = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using A0DataType       = BF16;
using AsDataType       = ck::Tuple<A0DataType>;
using B0DataType       = I8;
using B1DataType       = BF16;
using BsDataType       = ck::Tuple<B0DataType, B1DataType>;
using AccDataType      = F32;
using CShuffleDataType = BF16;
using D0DataType       = BF16;
using DsDataType       = ck::Tuple<D0DataType>;
using EDataType        = BF16;

using A0Layout = Row;
using AsLayout = ck::Tuple<A0Layout>;
using B0Layout = Row;
using B1Layout = B0Layout;
using BsLayout = ck::Tuple<B0Layout, B1Layout>;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout  = Row;

using Multiply    = ck::tensor_operation::element_wise::Multiply;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;

using AElementOp   = PassThrough;
using BElementOp   = Multiply;
using CDEElementOp = AddFastGelu;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
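// Note on the operand setup above: B is supplied as two tensors, B0 holding the int8
// weights and B1 holding bf16 scales, and BElementOp = Multiply combines them
// elementwise (b = b0 * b1), i.e. the weights are dequantized on the fly. In main()
// below, B1 is allocated with stride 0, so it is effectively a single row of N scales
// broadcast along K. D0 is a bf16 bias tensor and CDEElementOp = AddFastGelu produces
// e = FastGelu(c + d0) in bf16.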
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
// clang-format on
int main(int argc, char* argv[])
{
    // GEMM shape
    ck::index_t M = 64;
    ck::index_t N = 1024;
    ck::index_t K = 512;

    ck::index_t StrideA = K;
    ck::index_t StrideB = K;
    ck::index_t StrideD = N;
    ck::index_t StrideE = N;

    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 8)
    {
        M = std::stoi(argv[1]);
        N = std::stoi(argv[2]);
        K = std::stoi(argv[3]);

        StrideA = std::stoi(argv[4]);
        StrideB = std::stoi(argv[5]);
        StrideD = std::stoi(argv[6]);
        StrideE = std::stoi(argv[7]);
    }
    else
    {
        printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD, StrideE\n");
        exit(0);
    }

    auto f_matrix_space_size =
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);

            if constexpr(std::is_same<Layout, Row>::value)
            {
                return (nRow - 1) * stride + nCol;
            }
            else
            {
                return (nCol - 1) * stride + nRow;
            }
        };

    SimpleDeviceMem a0_device_buf(sizeof(A0DataType) *
                                  f_matrix_space_size(M, K, StrideA, A0Layout{}));
    SimpleDeviceMem b0_device_buf(sizeof(B0DataType) *
                                  f_matrix_space_size(K, N, StrideB, B0Layout{}));
    SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * f_matrix_space_size(K, N, 0, B1Layout{}));
    SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
                                  f_matrix_space_size(M, N, StrideD, ELayout{}));
    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
                                 f_matrix_space_size(M, N, StrideE, ELayout{}));

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{};

    constexpr ck::index_t NumATensor = 1;
    constexpr ck::index_t NumBTensor = 2;
    constexpr ck::index_t NumDTensor = 1;

    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleABD<AsLayout,
                                                                         BsLayout,
                                                                         DsLayout,
                                                                         Row,
                                                                         AsDataType,
                                                                         BsDataType,
                                                                         DsDataType,
                                                                         BF16,
                                                                         AElementOp,
                                                                         BElementOp,
                                                                         CDEElementOp>;
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer(
            std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
            std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
                                                b1_device_buf.GetDeviceBuffer()},
            std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer()},
            e_device_buf.GetDeviceBuffer(),
            M,
            N,
            K,
            std::array<ck::index_t, NumATensor>{StrideA},
            std::array<ck::index_t, NumBTensor>{StrideB, 0},
            std::array<ck::index_t, NumDTensor>{StrideD},
            StrideE,
            a_element_op,
            b_element_op,
            cde_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop = std::size_t(2) * M * N * K;

            std::size_t num_btype =
                sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;

            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    if(found)
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(
            std::array<const void*, NumATensor>{a0_device_buf.GetDeviceBuffer()},
            std::array<const void*, NumBTensor>{b0_device_buf.GetDeviceBuffer(),
                                                b1_device_buf.GetDeviceBuffer()},
            std::array<const void*, NumDTensor>{d0_device_buf.GetDeviceBuffer()},
            e_device_buf.GetDeviceBuffer(),
            M,
            N,
            K,
            std::array<ck::index_t, NumATensor>{StrideA},
            std::array<ck::index_t, NumBTensor>{StrideB, 0},
            std::array<ck::index_t, NumDTensor>{StrideD},
            StrideE,
            a_element_op,
            b_element_op,
            cde_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}