gaoqiong / composable_kernel · Commits
Commit 87e80971, authored Nov 14, 2023 by Artur Wojcik

Merge branch 'develop' into uif2-initial

Parents: 7c284291, 5356c4a9

Showing 12 changed files with 1255 additions and 4 deletions (+1255 −4)
- client_example/11_grouped_conv_bwd_weight/CMakeLists.txt (+4 −0)
- client_example/11_grouped_conv_bwd_weight/common.hpp (+6 −2)
- client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp (+70 −0)
- example/44_elementwise_permute/CMakeLists.txt (+4 −0)
- example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp (+149 −0)
- example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp (+132 −0)
- example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp (+148 −0)
- example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp (+132 −0)
- include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp (+55 −0)
- include/ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp (+329 −0)
- include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp (+2 −2)
- include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_scale.hpp (+224 −0)
client_example/11_grouped_conv_bwd_weight/CMakeLists.txt

@@ -13,3 +13,7 @@ target_compile_features(client_grouped_conv3d_bwd_weight_fp16 PRIVATE cxx_std_17)
 add_executable(client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp)
 target_link_libraries(client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_operations)
 target_compile_features(client_grouped_conv3d_bwd_weight_fp32 PRIVATE cxx_std_17)
+add_executable(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp)
+target_link_libraries(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 PRIVATE composable_kernel::device_operations)
+target_compile_features(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 PRIVATE cxx_std_17)
client_example/11_grouped_conv_bwd_weight/common.hpp

@@ -85,7 +85,9 @@ template <ck::index_t NumDimSpatial,
           typename OutDataType,
           typename InLayout,
           typename WeiLayout,
-          typename OutLayout>
+          typename OutLayout,
+          typename AComputeType = InDataType,
+          typename BComputeType = AComputeType>
 bool run_grouped_conv_bwd_weight(const std::array<ck::index_t, NumDimSpatial + 3>& input_lengths,
                                  const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
...
@@ -113,7 +115,9 @@ bool run_grouped_conv_bwd_weight(
                                                                 OutDataType,
                                                                 PassThrough,
                                                                 PassThrough,
-                                                                PassThrough>;
+                                                                PassThrough,
+                                                                AComputeType,
+                                                                BComputeType>;
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::
         DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
...
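Because the two new template parameters are defaulted (AComputeType to InDataType, BComputeType to AComputeType), the pre-existing fp32/fp16 clients in this directory compile unchanged. A minimal sketch, not part of this commit, of a call that relies on those defaults (types and arrays as declared in the client example that follows):

```cpp
// Sketch only: AComputeType/BComputeType default to the fp16 input type here,
// so a caller written before this commit needs no changes.
bool ok = run_grouped_conv_bwd_weight<3,           // NumDimSpatial
                                      ck::half_t,  // InDataType
                                      ck::half_t,  // WeiDataType
                                      ck::half_t,  // OutDataType
                                      InLayout,
                                      WeiLayout,
                                      OutLayout>(input_lengths,
                                                 input_strides,
                                                 filter_lengths,
                                                 weights_strides,
                                                 output_lengths,
                                                 output_strides,
                                                 conv_filter_strides,
                                                 conv_filter_dilations,
                                                 input_left_pads,
                                                 input_right_pads);
```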
client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "common.hpp"

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;

using InLayout  = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;

using AComputeType = ck::bf8_t;
using BComputeType = ck::f8_t;

static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G             = 8;
static constexpr ck::index_t N             = 64;
static constexpr ck::index_t K             = 128;
static constexpr ck::index_t C             = 128;
static constexpr ck::index_t Z             = 3;
static constexpr ck::index_t Y             = 3;
static constexpr ck::index_t X             = 3;
static constexpr ck::index_t Di            = 28;
static constexpr ck::index_t Hi            = 28;
static constexpr ck::index_t Wi            = 3;
static constexpr ck::index_t Do            = 28;
static constexpr ck::index_t Ho            = 28;
static constexpr ck::index_t Wo            = 3;

static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> filter_lengths{G, K, C, Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
    N * Di * Hi * Wi * C, Di * Hi * Wi * C, 1, Hi * Wi * C, Wi * C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
    K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
    N * Do * Ho * Wo * K, Do * Ho * Wo * K, 1, Ho * Wo * K, Wo * K, K};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};

int main()
{
    return run_grouped_conv_bwd_weight<NumDimSpatial,
                                       InDataType,
                                       WeiDataType,
                                       OutDataType,
                                       InLayout,
                                       WeiLayout,
                                       OutLayout,
                                       AComputeType,
                                       BComputeType>(input_lengths,
                                                     input_strides,
                                                     filter_lengths,
                                                     weights_strides,
                                                     output_lengths,
                                                     output_strides,
                                                     conv_filter_strides,
                                                     conv_filter_dilations,
                                                     input_left_pads,
                                                     input_right_pads)
               ? EXIT_SUCCESS
               : EXIT_FAILURE;
}
example/44_elementwise_permute/CMakeLists.txt

add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
add_example_executable(example_elementwise_permute_4D_fp32_row elementwise_permute_4D_fp32_row.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_row elementwise_permute_4D_fp16_row.cpp)
add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permute_4D_fp32_col.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
add_example_executable(example_elementwise_permute elementwise_permute.cpp)
add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp (new file, mode 100644)

#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"

#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;
using F32 = float;

using ADataType = F16;
using BDataType = F16;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
using Scale       = ck::tensor_operation::element_wise::Scale;

using DeviceElementwisePermuteInstance =
    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
                                                        PassThrough,          // ElementwiseOp
                                                        UnaryOp,              // UnaryOp
                                                        Scale,                // Scalar
                                                        4,                    // NumDim
                                                        8,                    // MPerThread
                                                        ck::Sequence<1>,  // InScalarPerVectorSeq
                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq

template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
                        const HostTensorA& A_nchw,
                        FunctorA functor_a,
                        FunctorB functor_b,
                        float scale)
{
    std::size_t N = A_nchw.mDesc.GetLengths()[0];
    std::size_t C = A_nchw.mDesc.GetLengths()[1];
    std::size_t H = A_nchw.mDesc.GetLengths()[2];
    std::size_t W = A_nchw.mDesc.GetLengths()[3];
    for(std::size_t w = 0; w < W; ++w)
        for(std::size_t h = 0; h < H; ++h)
            for(std::size_t c = 0; c < C; ++c)
                for(std::size_t n = 0; n < N; ++n)
                {
                    ADataType tmp_val;
                    // auto a_val = A_nchw(n, c, h, w);
                    auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
                    functor_b(tmp_val, a_val);
                    // functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
                    functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
                              scale * tmp_val);
                }
}

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    std::vector<std::size_t> nchw = {4, 2, 1, 8};
    std::vector<std::size_t> nhwc = {4, 1, 8, 2};
    Tensor<ADataType> a(nchw);
    Tensor<BDataType> b(nhwc);
    float scale = 1.f;
    auto i      = 0;
    for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
        for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
            for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
                for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
                {
                    a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
                            (h * nchw[3]) + w] = i;
                    i++;
                }

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    std::array<ck::index_t, 4> ab_lengths;
    std::array<ck::index_t, 4> a_strides = {1,
                                            static_cast<int>(nchw[0]),
                                            static_cast<int>(nchw[0] * nchw[1]),
                                            static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
    std::array<ck::index_t, 4> b_strides = {1,
                                            static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
                                            static_cast<int>(nhwc[0]),
                                            static_cast<int>(nhwc[0] * nhwc[1])};
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
                                                         {a_strides},
                                                         {b_strides},
                                                         input,
                                                         output,
                                                         PassThrough{},
                                                         UnaryOp{},
                                                         Scale{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A (nchw): " << a.mDesc << std::endl;
    std::cout << "B (nhwc): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];

    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
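The _col variants hand the device instance column-major strides, with n as the fastest-varying axis. Since the last merged axis is then not stride-1, both InScalarPerVectorSeq and OutScalarPerVectorSeq must be 1 (see IsSupportedArgument in the impl header below). A small sketch, assuming a helper that is not in the commit, of how the a_strides initializer above is derived; b_strides are the same column-major strides of the NHWC tensor, permuted into A's n,c,h,w axis order:

```cpp
// Hypothetical helper mirroring the a_strides initializer above:
// column-major strides for 4-D lengths l: stride[0] = 1, stride[k] = l[0] * ... * l[k-1].
std::array<ck::index_t, 4> colmajor_strides(const std::vector<std::size_t>& l)
{
    std::array<ck::index_t, 4> s{};
    ck::index_t acc = 1;
    for(std::size_t k = 0; k < 4; ++k)
    {
        s[k] = acc;
        acc *= static_cast<ck::index_t>(l[k]);
    }
    return s;
}
// colmajor_strides({4, 2, 1, 8}) == {1, 4, 8, 8}, matching a_strides for nchw = {4, 2, 1, 8}.
```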
example/44_elementwise_permute/elementwise_permute_4D_fp16_row.cpp (new file, mode 100644)

#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"

#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;
using F32 = float;

using ADataType = F16;
using BDataType = F16;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
using Scale       = ck::tensor_operation::element_wise::Scale;

using DeviceElementwisePermuteInstance =
    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
                                                        PassThrough,          // ElementwiseOp
                                                        UnaryOp,              // UnaryOp
                                                        Scale,                // Scalar
                                                        4,                    // NumDim
                                                        8,                    // MPerThread
                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq

template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
                        const HostTensorA& A_nchw,
                        FunctorA functor_a,
                        FunctorB functor_b,
                        float scale)
{
    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
                {
                    ADataType tmp_val;
                    auto a_val = A_nchw(n, c, h, w);
                    functor_b(tmp_val, a_val);
                    functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
                }
}

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    std::vector<std::size_t> nchw = {16, 128, 32, 64};
    std::vector<std::size_t> nhwc = {16, 32, 64, 128};
    Tensor<ADataType> a(nchw);
    Tensor<BDataType> b(nhwc);
    float scale = 2.f;
    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    std::array<ck::index_t, 4> ab_lengths;
    std::array<ck::index_t, 4> a_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
                                            static_cast<int>(nchw[2] * nchw[3]),
                                            static_cast<int>(nchw[3]),
                                            1};
    std::array<ck::index_t, 4> b_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]),
                                            1,
                                            static_cast<int>(nhwc[2] * nhwc[3]),
                                            static_cast<int>(nhwc[3])};
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
                                                         {a_strides},
                                                         {b_strides},
                                                         input,
                                                         output,
                                                         PassThrough{},
                                                         UnaryOp{},
                                                         Scale{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A (nchw): " << a.mDesc << std::endl;
    std::cout << "B (nhwc): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];

    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
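The _row variants keep both tensors' natural row-major strides: the input's last axis is contiguous (stride 1) and is read in vectors of 8 (InScalarPerVectorSeq = 8), while the permuted output's last axis is strided and must be written one scalar at a time (OutScalarPerVectorSeq = 1). A standalone sketch of the per-tensor check that DeviceElementwiseImpl::IsSupportedArgument applies; the free function below is an illustration, not library code:

```cpp
// Illustration of IsScalarPerVectorValid from device_elementwise_scale_impl.hpp.
bool scalar_per_vector_ok(ck::index_t last_len, ck::index_t last_stride, ck::index_t spv)
{
    if(last_stride == 1 && last_len % spv == 0)
        return true; // contiguous last axis: vector access of width spv is legal
    if(last_stride != 1 && spv == 1)
        return true; // strided last axis: only scalar access is legal
    return false;
}
// For this example: scalar_per_vector_ok(64, 1, 8) for the input and
// scalar_per_vector_ok(64, 128, 1) for the output, so both hold.
```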
example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp (new file, mode 100644)

#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"

#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;
using F32 = float;

using ADataType = F32;
using BDataType = F32;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
using Scale       = ck::tensor_operation::element_wise::Scale;

using DeviceElementwisePermuteInstance =
    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
                                                        PassThrough,          // ElementwiseOp
                                                        UnaryOp,              // UnaryOp
                                                        Scale,                // Scalar
                                                        4,                    // NumDim
                                                        1,                    // MPerThread
                                                        ck::Sequence<1>,  // InScalarPerVectorSeq
                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq

template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
                        const HostTensorA& A_nchw,
                        FunctorA functor_a,
                        FunctorB functor_b,
                        float scale)
{
    std::size_t N = A_nchw.mDesc.GetLengths()[0];
    std::size_t C = A_nchw.mDesc.GetLengths()[1];
    std::size_t H = A_nchw.mDesc.GetLengths()[2];
    std::size_t W = A_nchw.mDesc.GetLengths()[3];
    for(std::size_t w = 0; w < W; ++w)
        for(std::size_t h = 0; h < H; ++h)
            for(std::size_t c = 0; c < C; ++c)
                for(std::size_t n = 0; n < N; ++n)
                {
                    ADataType tmp_val;
                    auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
                    functor_b(tmp_val, a_val);
                    functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
                              scale * tmp_val);
                }
}

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    std::vector<std::size_t> nchw = {5, 4, 2, 3};
    std::vector<std::size_t> nhwc = {5, 2, 3, 4};
    Tensor<ADataType> a(nchw);
    Tensor<BDataType> b(nhwc);
    float scale = 1.f;
    auto i      = 0;
    for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
        for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
            for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
                for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
                {
                    a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
                            (h * nchw[3]) + w] = i;
                    i++;
                }

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    std::array<ck::index_t, 4> ab_lengths;
    std::array<ck::index_t, 4> a_strides = {1,
                                            static_cast<int>(nchw[0]),
                                            static_cast<int>(nchw[0] * nchw[1]),
                                            static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
    std::array<ck::index_t, 4> b_strides = {1,
                                            static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
                                            static_cast<int>(nhwc[0]),
                                            static_cast<int>(nhwc[0] * nhwc[1])};
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
                                                         {a_strides},
                                                         {b_strides},
                                                         input,
                                                         output,
                                                         PassThrough{},
                                                         UnaryOp{},
                                                         Scale{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A (nchw): " << a.mDesc << std::endl;
    std::cout << "B (nhwc): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];

    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
example/44_elementwise_permute/elementwise_permute_4D_fp32_row.cpp (new file, mode 100644)

#include <iostream>
#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"

#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using F16 = ck::half_t;
using F32 = float;

using ADataType = F32;
using BDataType = F32;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp     = ck::tensor_operation::element_wise::UnarySquare;
using Scale       = ck::tensor_operation::element_wise::Scale;

using DeviceElementwisePermuteInstance =
    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
                                                        PassThrough,          // ElementwiseOp
                                                        UnaryOp,              // UnaryOp
                                                        Scale,                // Scalar
                                                        4,                    // NumDim
                                                        8,                    // MPerThread
                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq

template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
                        const HostTensorA& A_nchw,
                        FunctorA functor_a,
                        FunctorB functor_b,
                        float scale)
{
    for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n)
        for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c)
            for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h)
                for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w)
                {
                    ADataType tmp_val;
                    auto a_val = A_nchw(n, c, h, w);
                    functor_b(tmp_val, a_val);
                    functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
                }
}

int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    std::vector<std::size_t> nchw = {16, 128, 32, 64};
    std::vector<std::size_t> nhwc = {16, 32, 64, 128};
    Tensor<ADataType> a(nchw);
    Tensor<BDataType> b(nhwc);
    float scale = 2.f;
    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    std::array<ck::index_t, 4> ab_lengths;
    std::array<ck::index_t, 4> a_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
                                            static_cast<int>(nchw[2] * nchw[3]),
                                            static_cast<int>(nchw[3]),
                                            1};
    std::array<ck::index_t, 4> b_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]),
                                            1,
                                            static_cast<int>(nhwc[2] * nhwc[3]),
                                            static_cast<int>(nhwc[3])};
    ck::ranges::copy(nchw, ab_lengths.begin());

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(ab_lengths,
                                                         {a_strides},
                                                         {b_strides},
                                                         input,
                                                         output,
                                                         PassThrough{},
                                                         UnaryOp{},
                                                         Scale{scale});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    };

    std::cout << "A (nchw): " << a.mDesc << std::endl;
    std::cout << "B (nhwc): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
    std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];

    std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
                            sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D(host_b, a, PassThrough{}, UnaryOp{}, scale);

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
include/ck/tensor_operation/gpu/device/device_elementwise_scale.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <memory>
#include <array>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          typename Scale,
          index_t NumDim>
struct DeviceElementwise : public BaseOperator
{
    static constexpr int NumInput  = InDataTypeTuple::Size();
    static constexpr int NumOutput = OutDataTypeTuple::Size();

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                        const std::array<const void*, NumInput> in_dev_buffers,
                        const std::array<void*, NumOutput> out_dev_buffers,
                        ElementwiseOperation elementwise_op,
                        UnaryOperation unary_op,
                        Scale scale_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          typename Scale,
          index_t NumDim>
using DeviceElementwisePtr = std::unique_ptr<DeviceElementwise<InDataTypeTuple,
                                                               OutDataTypeTuple,
                                                               ElementwiseOperation,
                                                               UnaryOperation,
                                                               Scale,
                                                               NumDim>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
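Since DeviceElementwiseImpl (the next file) derives from this interface, a caller can hold the concrete instance behind the DeviceElementwisePtr alias and drive it purely through the virtuals. A brief sketch, reusing the aliases and arrays from the examples above; this usage is illustrative, not part of the commit:

```cpp
// Sketch: driving the example instance through the abstract base.
ck::tensor_operation::device::DeviceElementwisePtr<ck::Tuple<ADataType>,
                                                   ck::Tuple<BDataType>,
                                                   PassThrough,
                                                   UnaryOp,
                                                   Scale,
                                                   4>
    op = std::make_unique<DeviceElementwisePermuteInstance>();

auto arg = op->MakeArgumentPointer(
    ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}, UnaryOp{}, Scale{scale});
auto invoker = op->MakeInvokerPointer();
invoker->Run(arg.get(), StreamConfig{nullptr, false}); // false: do not time the kernel
```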
include/ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_scale.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataTypeTuple,
          typename OutDataTypeTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          typename Scale,
          index_t NumDim,
          index_t MPerThread,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
struct DeviceElementwiseImpl : public DeviceElementwise<InDataTypeTuple,
                                                        OutDataTypeTuple,
                                                        ElementwiseOperation,
                                                        UnaryOperation,
                                                        Scale,
                                                        NumDim>
{
    static constexpr int NumInput  = InDataTypeTuple::Size();
    static constexpr int NumOutput = OutDataTypeTuple::Size();

    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
                      NumOutput == OutScalarPerVectorSeq::Size(),
                  "Tuple size is inconsistent with the number of in/out!");

    static auto GenerateInDataTypePointerTuple()
    {
        return generate_tuple(
            [&](auto I) {
                using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
                return static_cast<const DataType*>(nullptr);
            },
            Number<NumInput>{});
    };

    static auto GenerateOutDataTypePointerTuple()
    {
        return generate_tuple(
            [&](auto I) {
                using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
                return static_cast<DataType*>(nullptr);
            },
            Number<NumOutput>{});
    };

    using InDataTypePointerTuple  = decltype(GenerateInDataTypePointerTuple());
    using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple());

    template <typename Desc_M>
    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
    {
        constexpr auto I0 = Number<0>{};

        const auto m            = desc_m.GetLength(I0);
        const index_t loop_step = gridSize * blockSize * MPerThread;
        const auto pad          = math::integer_least_multiple(m, loop_step) - m;
        const auto desc_m_pad =
            transform_tensor_descriptor(desc_m,
                                        make_tuple(make_right_pad_transform(m, pad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        return desc_m_pad;
    }

    static auto MakeDescriptor_M(const std::array<index_t, NumDim>& lengths,
                                 const std::array<index_t, NumDim>& stride,
                                 index_t gridSize,
                                 index_t blockSize)
    {
        auto tupleOfShape  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NumDim>{});
        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<NumDim>{});

        // nd desc - [s0, s1, s2, ...]
        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);

        // merge nd to 1d desc - [s0 * s1 * ...]
        if constexpr(NumDim > 1)
        {
            const auto desc_m = transform_tensor_descriptor(
                desc,
                make_tuple(make_merge_transform(tupleOfShape)),
                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NumDim>{})),
                make_tuple(Sequence<0>{}));

            return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
        }
        else
            return PadDescriptor_M_1d(desc, gridSize, blockSize);
    }

    template <index_t TupleSize>
    static auto GenerateInOutGrid1dDescTuple(Number<TupleSize>)
    {
        return generate_tuple(
            [&](auto) {
                if constexpr(NumDim > 1)
                {
                    return MakeDescriptor_M({1, 1}, {1, 1}, 1, 1);
                }
                else
                {
                    return MakeDescriptor_M({1}, {1}, 1, 1);
                };
            },
            Number<TupleSize>{});
    };

    using InGrid1dDescTuple  = decltype(GenerateInOutGrid1dDescTuple(Number<NumInput>{}));
    using OutGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number<NumOutput>{}));

    using GridwiseElementwise = GridwiseElementwise_1D<InGrid1dDescTuple,
                                                       OutGrid1dDescTuple,
                                                       InDataTypePointerTuple,
                                                       OutDataTypePointerTuple,
                                                       ElementwiseOperation,
                                                       UnaryOperation,
                                                       Scale,
                                                       MPerThread,
                                                       InScalarPerVectorSeq,
                                                       OutScalarPerVectorSeq>;

    struct Argument : public BaseArgument
    {
        Argument(const std::array<index_t, NumDim> lengths,
                 const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                 const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                 const std::array<const void*, NumInput> in_dev_buffers,
                 const std::array<void*, NumOutput> out_dev_buffers,
                 ElementwiseOperation elementwise_op,
                 UnaryOperation unary_op,
                 Scale scale_op)
            : lengths_(lengths),
              inStridesArray_(inStridesArray),
              outStridesArray_(outStridesArray),
              elementwise_op_(elementwise_op),
              unary_op_(unary_op),
              scale_op_(scale_op),
              blockSize_(256)
        {
            in_dev_buffers_ = generate_tuple(
                [&](auto I) {
                    using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
                    return static_cast<const DataType*>(in_dev_buffers[I.value]);
                },
                Number<NumInput>{});

            out_dev_buffers_ = generate_tuple(
                [&](auto I) {
                    using DataType = remove_cvref_t<decltype(OutDataTypeTuple{}[I])>;
                    return static_cast<DataType*>(out_dev_buffers[I.value]);
                },
                Number<NumOutput>{});
        }

        InDataTypePointerTuple in_dev_buffers_;
        OutDataTypePointerTuple out_dev_buffers_;

        std::array<index_t, NumDim> lengths_;
        std::array<std::array<index_t, NumDim>, NumInput> inStridesArray_;
        std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray_;

        ElementwiseOperation elementwise_op_;
        UnaryOperation unary_op_;
        Scale scale_op_;
        index_t blockSize_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            index_t gridSize = getAvailableComputeUnitCount(stream_config);

            auto in_grid_1d_desc_tuple = generate_tuple(
                [&](auto I) {
                    return MakeDescriptor_M(
                        arg.lengths_, arg.inStridesArray_[I.value], gridSize, arg.blockSize_);
                },
                Number<NumInput>{});

            auto out_grid_1d_desc_tuple = generate_tuple(
                [&](auto I) {
                    return MakeDescriptor_M(
                        arg.lengths_, arg.outStridesArray_[I.value], gridSize, arg.blockSize_);
                },
                Number<NumOutput>{});

            const auto kernel = kernel_elementwise_1d<GridwiseElementwise,
                                                      InGrid1dDescTuple,
                                                      OutGrid1dDescTuple,
                                                      InDataTypePointerTuple,
                                                      OutDataTypePointerTuple,
                                                      ElementwiseOperation,
                                                      UnaryOperation,
                                                      Scale>;

            float elapsed_time = launch_and_time_kernel(stream_config,
                                                        kernel,
                                                        dim3(gridSize),
                                                        dim3(arg.blockSize_),
                                                        0,
                                                        in_grid_1d_desc_tuple,
                                                        out_grid_1d_desc_tuple,
                                                        arg.in_dev_buffers_,
                                                        arg.out_dev_buffers_,
                                                        arg.elementwise_op_,
                                                        arg.unary_op_,
                                                        arg.scale_op_);
            return elapsed_time;
        }

        // polymorphic
        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    static bool IsSupportedArgument(const Argument& arg)
    {
        if(arg.lengths_.back() % MPerThread != 0)
            return false;

        auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
                                          const std::array<index_t, NumDim>& strides,
                                          index_t scalarPerVector) {
            if(strides.back() == 1 && lengths.back() % scalarPerVector == 0)
                return true;

            if(strides.back() != 1 && scalarPerVector == 1)
                return true;

            return false;
        };

        bool valid = true;
        static_for<0, NumInput, 1>{}([&](auto I) {
            if(!IsScalarPerVectorValid(
                   arg.lengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
                valid = false;
        });

        static_for<0, NumOutput, 1>{}([&](auto I) {
            if(!IsScalarPerVectorValid(
                   arg.lengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
                valid = false;
        });

        return valid;
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }

    static auto
    MakeArgument(const std::array<index_t, NumDim> lengths,
                 const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                 const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                 const std::array<const void*, NumInput> in_dev_buffers,
                 const std::array<void*, NumOutput> out_dev_buffers,
                 ElementwiseOperation elementwise_op,
                 UnaryOperation unary_op,
                 Scale scale_op)
    {
        return Argument{lengths,
                        inStridesArray,
                        outStridesArray,
                        in_dev_buffers,
                        out_dev_buffers,
                        elementwise_op,
                        unary_op,
                        scale_op};
    }

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
                        const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
                        const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
                        const std::array<const void*, NumInput> in_dev_buffers,
                        const std::array<void*, NumOutput> out_dev_buffers,
                        ElementwiseOperation elementwise_op,
                        UnaryOperation unary_op,
                        Scale scale_op) override
    {
        return std::make_unique<Argument>(lengths,
                                          inStridesArray,
                                          outStridesArray,
                                          in_dev_buffers,
                                          out_dev_buffers,
                                          elementwise_op,
                                          unary_op,
                                          scale_op);
    }

    static auto MakeInvoker() { return Invoker{}; }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
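MakeDescriptor_M flattens the N-D tensor into one merged dimension of length m = s0 * s1 * ..., then right-pads it so the grid-stride loop divides evenly: every thread makes the same number of trips of gridSize * blockSize * MPerThread elements, and the pad region is masked by the right-pad transform. A host-side sketch of that arithmetic; the constexpr helpers below are illustrations, not library code, and the gridSize of 120 is a made-up compute-unit count:

```cpp
#include <cstdint>

// Illustration of the padding arithmetic in PadDescriptor_M_1d.
constexpr std::int64_t integer_least_multiple(std::int64_t m, std::int64_t step)
{
    return ((m + step - 1) / step) * step; // smallest multiple of step that is >= m
}

constexpr std::int64_t pad_for(std::int64_t m,
                               std::int64_t gridSize,
                               std::int64_t blockSize,
                               std::int64_t mPerThread)
{
    const std::int64_t loop_step = gridSize * blockSize * mPerThread;
    return integer_least_multiple(m, loop_step) - m;
}

// e.g. the fp16_row example: m = 16 * 128 * 32 * 64 = 4'194'304, with a hypothetical
// gridSize of 120 CUs, blockSize = 256, MPerThread = 8 (loop_step = 245'760):
static_assert(pad_for(4'194'304, 120, 256, 8) == 229'376);
```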
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

@@ -360,8 +360,8 @@ struct UnarySquare
     template <typename T>
     __host__ __device__ void operator()(T& y, const T& x) const
     {
-        static_assert(is_same_v<T, float> || is_same_v<T, double> || is_same_v<T, int32_t> ||
-                          is_same_v<T, int8_t>
+        static_assert(is_same_v<T, float> || is_same_v<T, half_t> || is_same_v<T, double> ||
+                          is_same_v<T, int32_t> || is_same_v<T, int8_t>
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
                           || is_same_v<T, int4_t>
 #endif
...
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d_scale.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseElementwise1dFunctor,
          typename InGrid1dDescTuple,
          typename OutGrid1dDescTuple,
          typename InDataTypePointerTuple,
          typename OutDataTypePointerTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          typename Scale>
__global__ void kernel_elementwise_1d(const InGrid1dDescTuple in_grid_1d_desc_tuple,
                                      const OutGrid1dDescTuple out_grid_1d_desc_tuple,
                                      const InDataTypePointerTuple p_in_global_tuple,
                                      const OutDataTypePointerTuple p_out_global_tuple,
                                      const ElementwiseOperation elementwise_op,
                                      const UnaryOperation unary_op,
                                      const Scale scale_op)
{
    GridwiseElementwise1dFunctor::Run(in_grid_1d_desc_tuple,
                                      out_grid_1d_desc_tuple,
                                      p_in_global_tuple,
                                      p_out_global_tuple,
                                      elementwise_op,
                                      unary_op,
                                      scale_op);
}

template <typename InGrid1dDescTuple,
          typename OutGrid1dDescTuple,
          typename InDataTypePointerTuple,
          typename OutDataTypePointerTuple,
          typename ElementwiseOperation,
          typename UnaryOperation,
          typename Scale,
          index_t MPerThread,
          typename InScalarPerVectorSeq,
          typename OutScalarPerVectorSeq>
struct GridwiseElementwise_1D
{
    static constexpr index_t NumInput  = InDataTypePointerTuple::Size();
    static constexpr index_t NumOutput = OutDataTypePointerTuple::Size();

    static_assert(NumInput == InScalarPerVectorSeq::Size() &&
                      NumOutput == OutScalarPerVectorSeq::Size() &&
                      NumInput == InGrid1dDescTuple::Size() &&
                      NumOutput == OutGrid1dDescTuple::Size(),
                  "Tuple size is inconsistent with the number of in/out!");

    static constexpr auto I0 = Number<0>{};

    static constexpr auto thread_buffer_desc_m =
        make_naive_tensor_descriptor_packed(make_tuple(Number<MPerThread>{}));

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    __device__ static void Run(const InGrid1dDescTuple in_grid_1d_desc_tuple,
                               const OutGrid1dDescTuple out_grid_1d_desc_tuple,
                               const InDataTypePointerTuple p_in_global_tuple,
                               const OutDataTypePointerTuple p_out_global_tuple,
                               const ElementwiseOperation elementwise_op,
                               const UnaryOperation unary_op,
                               const Scale scale_op)
    {
        const index_t thread_global_id = get_thread_global_1d_id();

        auto in_thread_buf_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;

                return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MPerThread, true>{};
            },
            Number<NumInput>{});

        auto out_thread_buf_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
                using DataType        = remove_pointer_t<DataTypePointer>;

                return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, MPerThread, true>{};
            },
            Number<NumOutput>{});

        auto in_global_buf_tuple = generate_tuple(
            [&](auto I) {
                static_assert(in_grid_1d_desc_tuple[I].GetNumOfDimension() == 1);

                return make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_in_global_tuple[I], in_grid_1d_desc_tuple[I].GetElementSpaceSize());
            },
            Number<NumInput>{});

        auto out_global_buf_tuple = generate_tuple(
            [&](auto I) {
                static_assert(out_grid_1d_desc_tuple[I].GetNumOfDimension() == 1);

                return make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_out_global_tuple[I], out_grid_1d_desc_tuple[I].GetElementSpaceSize());
            },
            Number<NumOutput>{});

        const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread);

        const index_t blockSize    = get_block_size();
        const index_t blockPerGrid = get_grid_size();
        const auto M               = in_grid_1d_desc_tuple[I0].GetLength(I0);
        const index_t loop_step    = blockPerGrid * blockSize * MPerThread;
        const auto loop_step_index = make_multi_index(loop_step);

        auto in_global_load_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;

                return ThreadwiseTensorSliceTransfer_v2<DataType,
                                                        DataType,
                                                        decltype(in_grid_1d_desc_tuple[I]),
                                                        decltype(thread_buffer_desc_m),
                                                        Sequence<MPerThread>, // SliceLengths
                                                        Sequence<0>,          // DimAccessOrder
                                                        0,                    // SrcVectorDim
                                                        InScalarPerVectorSeq::At(I), // ScalarPerVector
                                                        1,     // SrcScalarStrideInVector
                                                        false>{in_grid_1d_desc_tuple[I],
                                                               thread_global_offset};
            },
            Number<NumInput>{});

        auto out_global_store_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[I])>;
                using DataType        = remove_pointer_t<DataTypePointer>;

                return ThreadwiseTensorSliceTransfer_v1r3<DataType,
                                                          DataType,
                                                          decltype(thread_buffer_desc_m),
                                                          decltype(out_grid_1d_desc_tuple[I]),
                                                          PassThroughOp,
                                                          Sequence<MPerThread>, // SliceLengths
                                                          Sequence<0>,          // DimAccessOrder
                                                          0,                    // SrcVectorDim
                                                          OutScalarPerVectorSeq::At(I),
                                                          InMemoryDataOperationEnum::Set,
                                                          1,
                                                          false>(out_grid_1d_desc_tuple[I],
                                                                 thread_global_offset,
                                                                 PassThroughOp{});
            },
            Number<NumOutput>{});

        index_t num_iter = M / (loop_step);
        do
        {
            static_for<0, NumInput, 1>{}([&](auto I) {
                in_global_load_tuple(I).Run(in_grid_1d_desc_tuple[I],
                                            in_global_buf_tuple[I],
                                            thread_buffer_desc_m,
                                            make_tuple(I0),
                                            in_thread_buf_tuple(I));

                in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_1d_desc_tuple[I],
                                                           loop_step_index);
            });

            static_for<0, MPerThread, 1>{}([&](auto iM) {
                // get reference to in data
                auto uop_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> auto& { return in_thread_buf_tuple(I)(iM); },
                    Number<NumInput>{});

                // get reference to dst data
                auto out_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> auto& { return out_thread_buf_tuple(I)(iM); },
                    Number<NumOutput>{});

                unpack2(unary_op, uop_data_refs, uop_data_refs);

                auto sop_in_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> auto& { return in_thread_buf_tuple(I)(iM); },
                    Number<NumInput>{});

                auto sop_out_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> auto& { return in_thread_buf_tuple(I)(iM); },
                    Number<NumInput>{});

                unpack2(scale_op, sop_out_data_refs, sop_in_data_refs);

                const auto in_data_refs = generate_tie(
                    // return type should be lvalue
                    [&](auto I) -> const auto& { return in_thread_buf_tuple(I)(iM); },
                    Number<NumInput>{});

                unpack2(elementwise_op, out_data_refs, in_data_refs);
            });

            static_for<0, NumOutput, 1>{}([&](auto I) {
                out_global_store_tuple(I).Run(thread_buffer_desc_m,
                                              make_tuple(I0),
                                              out_thread_buf_tuple[I],
                                              out_grid_1d_desc_tuple[I],
                                              out_global_buf_tuple(I));

                out_global_store_tuple(I).MoveDstSliceWindow(out_grid_1d_desc_tuple[I],
                                                             loop_step_index);
            });
        } while(--num_iter);
    }
};

} // namespace ck
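Per element, Run applies unary_op to the loaded value in place, then scale_op (whose result is also written back into the input thread buffer), and finally elementwise_op to fill the output register that is stored to global memory. A scalar-level sketch of that pipeline, with plain C++ stand-ins for the functors matching the operator choices used in the examples above:

```cpp
// Scalar stand-in for the static_for body above (illustration only).
float x     = 3.0f; // one loaded input element
float scale = 2.0f;

x = x * x;     // UnaryOp (UnarySquare), applied in place on the input buffer
x = scale * x; // Scale, also written back into the input buffer
float y = x;   // ElementwiseOp (PassThrough) fills the output buffer
// y is then written out by ThreadwiseTensorSliceTransfer_v1r3.
```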