Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
72e0c1c5
"vscode:/vscode.git/clone" did not exist on "d1894bdbbb993bc4ada02a9b21eb7f60f64966cd"
Commit
72e0c1c5
authored
Jun 19, 2023
by
Rostyslav Geyyer
Browse files
Merge branch 'develop' into lwpck-739
parents
898866e0
f0c620c4
Changes
103
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1320 additions
and
35 deletions
+1320
-35
CHANGELOG.md
CHANGELOG.md
+2
-0
client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
...xample/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
+10
-9
client_example/06_softmax/softmax4d.cpp
client_example/06_softmax/softmax4d.cpp
+25
-7
client_example/18_groupnorm/groupnorm_swish.cpp
client_example/18_groupnorm/groupnorm_swish.cpp
+24
-0
example/49_maxpool2d_bwd/CMakeLists.txt
example/49_maxpool2d_bwd/CMakeLists.txt
+3
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
+62
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+222
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
+62
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
+62
-0
example/50_put_element/CMakeLists.txt
example/50_put_element/CMakeLists.txt
+1
-0
example/50_put_element/put_element_fp16.cpp
example/50_put_element/put_element_fp16.cpp
+88
-0
include/ck/host_utility/stream_utility.hpp
include/ck/host_utility/stream_utility.hpp
+1
-1
include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
.../ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
+32
-0
include/ck/tensor_operation/gpu/device/device_put_element.hpp
...ude/ck/tensor_operation/gpu/device/device_put_element.hpp
+36
-0
include/ck/tensor_operation/gpu/device/device_softmax.hpp
include/ck/tensor_operation/gpu/device/device_softmax.hpp
+11
-6
include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
..._operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
+316
-0
include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp
...sor_operation/gpu/device/impl/device_put_element_impl.hpp
+155
-0
include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
.../tensor_operation/gpu/device/impl/device_softmax_impl.hpp
+5
-12
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
...or_operation/gpu/element/unary_element_wise_operation.hpp
+48
-0
include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
.../ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
+155
-0
No files found.
CHANGELOG.md
View file @
72e0c1c5
...
@@ -20,6 +20,8 @@ Full documentation for Composable Kernel is not yet available.
...
@@ -20,6 +20,8 @@ Full documentation for Composable Kernel is not yet available.
-
Added multi-embeddings support (#542).
-
Added multi-embeddings support (#542).
-
Added Navi3x blockwise GEMM and real GEMM support (#541).
-
Added Navi3x blockwise GEMM and real GEMM support (#541).
-
Added Navi grouped ConvBwdWeight support (#505).
-
Added Navi grouped ConvBwdWeight support (#505).
-
Added pool3d forward (#697).
-
Added maxpool backward (#750).
### Changed
### Changed
-
Changed ...
-
Changed ...
client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp
View file @
72e0c1c5
...
@@ -172,18 +172,19 @@ int main()
...
@@ -172,18 +172,19 @@ int main()
BLayout
,
BLayout
,
CLayout
>
();
CLayout
>
();
const
auto
normalize_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
get_device_normalize_from_mean_meansquare_instances
<
CDataType
,
ReduceDataType
,
ReduceDataType
,
GammaDataType
,
BetaDataType
,
LayerNormOutDataType
>
();
std
::
cout
<<
"found "
<<
gemm_reduce_ptrs
.
size
()
std
::
cout
<<
"found "
<<
gemm_reduce_ptrs
.
size
()
<<
" gemm_reduceMean_reduceSquareMean instances"
<<
std
::
endl
;
<<
" gemm_reduceMean_reduceSquareMean instances"
<<
std
::
endl
;
using
NormalizeDeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceElementwise
<
ck
::
Tuple
<
CDataType
,
ReduceDataType
,
ReduceDataType
,
GammaDataType
,
BetaDataType
>
,
ck
::
Tuple
<
LayerNormOutDataType
>
,
ck
::
tensor_operation
::
element_wise
::
Normalize
,
2
>
;
const
auto
normalize_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
NormalizeDeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
normalize_ptrs
.
size
()
<<
" normalize instances"
<<
std
::
endl
;
std
::
cout
<<
"found "
<<
normalize_ptrs
.
size
()
<<
" normalize instances"
<<
std
::
endl
;
auto
f_matrix_space_size
=
auto
f_matrix_space_size
=
...
...
client_example/06_softmax/softmax4d.cpp
View file @
72e0c1c5
...
@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
...
@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
num_elements
);
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
num_elements
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
num_elements
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
num_elements
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceSoftmax
<
InDataType
,
DeviceSoftmax
<
InDataType
,
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
>
;
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
,
NumReduceDim
>
;
// get device op instances
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
DeviceOp
>::
GetInstances
();
auto
&
generic_op_ptr
=
op_ptrs
[
0
];
auto
generic_argument_ptr
=
generic_op_ptr
->
MakeArgumentPointer
(
in_lengths
,
in_strides
,
reduce_dims
,
alpha
,
beta
,
in
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
PassThrough
{},
PassThrough
{});
if
(
!
generic_op_ptr
->
IsSupportedArgument
(
generic_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"The generic kernel instance should be able to support any input shapes"
);
};
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
std
::
string
best_op_name
;
...
@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
...
@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
{
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
&
op_ptr
=
op_ptrs
[
i
];
if
(
op_ptr
->
GetRank
()
!=
Rank
||
op_ptr
->
GetNumReduceDim
()
!=
NumReduceDim
)
{
continue
;
}
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in_lengths
,
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in_lengths
,
in_strides
,
in_strides
,
reduce_dims
,
reduce_dims
,
...
...
client_example/18_groupnorm/groupnorm_swish.cpp
View file @
72e0c1c5
...
@@ -72,6 +72,30 @@ int main(int argc, char* argv[])
...
@@ -72,6 +72,30 @@ int main(int argc, char* argv[])
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
&
generic_op_ptr
=
op_ptrs
[
0
];
auto
generic_argument_ptr
=
generic_op_ptr
->
MakeArgumentPointer
({
N
,
H
,
W
,
G
,
C
},
// lengths
xy_strides
,
// xStrides
gamma_beta_strides
,
// gammaStrides
gamma_beta_strides
,
// betaStrides
xy_strides
,
// yStrides
{
1
,
2
,
4
},
// reduceDims
1e-6
,
x_device_buf
.
GetDeviceBuffer
(),
gamma_device_buf
.
GetDeviceBuffer
(),
beta_device_buf
.
GetDeviceBuffer
(),
y_device_buf
.
GetDeviceBuffer
(),
nullptr
,
nullptr
,
Swish
{});
if
(
!
generic_op_ptr
->
IsSupportedArgument
(
generic_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"The generic kernel instance should be able to support any input shapes"
);
};
std
::
string
best_op_name
;
std
::
string
best_op_name
;
bool
found
=
false
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
int
best_op_id
=
-
1
;
...
...
example/49_maxpool2d_bwd/CMakeLists.txt
0 → 100644
View file @
72e0c1c5
add_example_executable
(
example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp
)
add_example_executable
(
example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp
)
add_example_executable
(
example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp
)
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using
InDataType
=
ck
::
bhalf_t
;
using
OutDataType
=
ck
::
bhalf_t
;
using
IndexDataType
=
int32_t
;
using
ComputeDataType
=
float
;
using
DInDataType
=
ck
::
bhalf_t
;
using
DOutDataType
=
ck
::
bhalf_t
;
static
constexpr
bool
PropagateNan
=
false
;
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
false
;
// Pool shape
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
1
;
ck
::
index_t
Y
=
3
;
ck
::
index_t
X
=
3
;
ck
::
index_t
Hi
=
32
;
ck
::
index_t
Wi
=
32
;
ck
::
index_t
window_stride_h
=
1
;
ck
::
index_t
window_stride_w
=
1
;
ck
::
index_t
in_left_pad_h
=
0
;
ck
::
index_t
in_left_pad_w
=
0
;
ck
::
index_t
in_right_pad_h
=
0
;
ck
::
index_t
in_right_pad_w
=
0
;
bool
pass
=
maxpool_bwd_test
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
DOutDataType
,
PropagateNan
>
(
do_verification
,
time_kernel
,
N
,
C
,
Y
,
X
,
Hi
,
Wi
,
window_stride_h
,
window_stride_w
,
in_left_pad_h
,
in_left_pad_w
,
in_right_pad_h
,
in_right_pad_w
);
return
(
pass
?
0
:
1
);
}
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
template
<
typename
InDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
ComputeDataType
,
typename
DInDataType
,
typename
DOutDataType
,
bool
PropagateNan
>
bool
maxpool_bwd_test
(
bool
do_verification
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Y
,
ck
::
index_t
X
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
ck
::
index_t
window_stride_h
,
ck
::
index_t
window_stride_w
,
ck
::
index_t
in_left_pad_h
,
ck
::
index_t
in_left_pad_w
,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
IndexDataType
,
// IndexDataType
ComputeDataType
,
// ComputeDataType
ck
::
ReduceTensorOp
::
MAX
,
true
,
// OutputIndex
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
1
>
;
// InSrcOutDstVectorSize
using
DeviceMaxPoolBwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceIndexPoolBwdImpl
<
DOutDataType
,
IndexDataType
,
DInDataType
,
4
>
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
const
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
{
Y
,
X
};
const
std
::
vector
<
ck
::
index_t
>
window_strides
{
window_stride_h
,
window_stride_w
};
const
std
::
vector
<
ck
::
index_t
>
input_left_pads
{
in_left_pad_h
,
in_left_pad_w
};
const
std
::
vector
<
ck
::
index_t
>
input_right_pads
{
in_right_pad_h
,
in_right_pad_w
};
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
H
,
std
::
size_t
W
)
{
using
namespace
ck
::
literals
;
// reference need Tensor with NCHW order
return
HostTensorDescriptor
({
N_
,
C_
,
H
,
W
},
{
C_
*
H
*
W
,
1
_uz
,
W
*
C_
,
C_
});
};
// in
Tensor
<
InDataType
>
in_n_c_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
// out
Tensor
<
OutDataType
>
out_n_c_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
Tensor
<
OutDataType
>
out_n_c_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// indices
Tensor
<
IndexDataType
>
indices_n_c_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
Tensor
<
IndexDataType
>
indices_n_c_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// dout
Tensor
<
DOutDataType
>
dout_n_c_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// din
Tensor
<
DInDataType
>
din_n_c_hi_wi_host
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
Tensor
<
DInDataType
>
din_n_c_hi_wi_device
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_c_ho_wo: "
<<
out_n_c_ho_wo_host
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"indices_n_c_ho_wo: "
<<
indices_n_c_ho_wo_host
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"dout_n_c_ho_wo: "
<<
dout_n_c_ho_wo
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"din_n_c_hi_wi: "
<<
din_n_c_hi_wi_host
.
mDesc
<<
std
::
endl
;
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
dout_n_c_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
DOutDataType
>
{
-
1.0
,
1.0
});
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_c_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
indices_device_buf
(
sizeof
(
IndexDataType
)
*
indices_n_c_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
dout_device_buf
.
ToDevice
(
dout_n_c_ho_wo
.
mData
.
data
());
auto
pool_fwd
=
DevicePoolFwdInstance
{};
auto
pool_fwd_invoker_ptr
=
pool_fwd
.
MakeInvokerPointer
();
auto
pool_fwd_argument_ptr
=
pool_fwd
.
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Hi
,
Wi
},
window_spatial_lengths
,
{
N
,
C
,
Ho
,
Wo
},
{
C
*
Hi
*
Wi
,
1
,
Wi
*
C
,
C
},
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
},
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
},
window_strides
,
input_left_pads
,
input_right_pads
,
{
2
,
3
});
if
(
!
pool_fwd
.
IsSupportedArgument
(
pool_fwd_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! pool_fwd with the specified compilation parameters does "
"not support this problem"
);
}
float
ave_time_fwd
=
pool_fwd_invoker_ptr
->
Run
(
pool_fwd_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
auto
pool_bwd
=
DeviceMaxPoolBwdInstance
{};
auto
pool_bwd_invoker_ptr
=
pool_bwd
.
MakeInvokerPointer
();
auto
pool_bwd_argument_ptr
=
pool_bwd
.
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
(),
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
(),
window_spatial_lengths
,
window_strides
);
if
(
!
pool_bwd
.
IsSupportedArgument
(
pool_bwd_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! pool_bwd with the specified compilation parameters does "
"not support this problem"
);
}
size_t
pool_bwd_workspace_sz
=
pool_bwd
.
GetWorkSpaceSize
(
pool_bwd_argument_ptr
.
get
());
DeviceMem
pool_bwd_workspace_device_buf
(
pool_bwd_workspace_sz
);
pool_bwd
.
SetWorkSpacePointer
(
pool_bwd_argument_ptr
.
get
(),
pool_bwd_workspace_device_buf
.
GetDeviceBuffer
());
float
ave_time_bwd
=
pool_bwd_invoker_ptr
->
Run
(
pool_bwd_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Pool fwd perf: "
<<
ave_time_fwd
<<
" ms"
<<
std
::
endl
;
std
::
cout
<<
"Pool bwd perf: "
<<
ave_time_bwd
<<
" ms"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
using
ReferencePoolingFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferencePoolingFwd
<
4
,
2
,
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
ck
::
ReduceTensorOp
::
MAX
,
PropagateNan
,
true
>
;
auto
ref_pooling_fwd
=
ReferencePoolingFwdInstance
{};
auto
ref_pooling_fwd_invoker
=
ref_pooling_fwd
.
MakeInvoker
();
auto
ref_pooling_fwd_argument
=
ref_pooling_fwd
.
MakeArgument
(
in_n_c_hi_wi
,
out_n_c_ho_wo_host
,
indices_n_c_ho_wo_host
,
window_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
ref_pooling_fwd_invoker
.
Run
(
ref_pooling_fwd_argument
);
using
ReferencePoolingBwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
PassThrough
>
;
auto
ref_pooling_bwd
=
ReferencePoolingBwdInstance
{};
auto
ref_pooling_bwd_invoker
=
ref_pooling_bwd
.
MakeInvoker
();
auto
ref_pooling_bwd_argument
=
ref_pooling_bwd
.
MakeArgument
(
dout_n_c_ho_wo
,
indices_n_c_ho_wo_host
,
din_n_c_hi_wi_host
,
PassThrough
{});
ref_pooling_bwd_invoker
.
Run
(
ref_pooling_bwd_argument
);
out_device_buf
.
FromDevice
(
out_n_c_ho_wo_device
.
mData
.
data
());
indices_device_buf
.
FromDevice
(
indices_n_c_ho_wo_device
.
mData
.
data
());
din_device_buf
.
FromDevice
(
din_n_c_hi_wi_device
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_n_c_ho_wo_device
,
out_n_c_ho_wo_host
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
indices_n_c_ho_wo_device
,
indices_n_c_ho_wo_host
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
din_n_c_hi_wi_device
,
din_n_c_hi_wi_host
);
}
return
(
pass
);
};
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
IndexDataType
=
int32_t
;
using
ComputeDataType
=
float
;
using
DInDataType
=
ck
::
half_t
;
using
DOutDataType
=
ck
::
half_t
;
static
constexpr
bool
PropagateNan
=
false
;
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
false
;
// Pool shape
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
1
;
ck
::
index_t
Y
=
3
;
ck
::
index_t
X
=
3
;
ck
::
index_t
Hi
=
32
;
ck
::
index_t
Wi
=
32
;
ck
::
index_t
window_stride_h
=
1
;
ck
::
index_t
window_stride_w
=
1
;
ck
::
index_t
in_left_pad_h
=
0
;
ck
::
index_t
in_left_pad_w
=
0
;
ck
::
index_t
in_right_pad_h
=
0
;
ck
::
index_t
in_right_pad_w
=
0
;
bool
pass
=
maxpool_bwd_test
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
DOutDataType
,
PropagateNan
>
(
do_verification
,
time_kernel
,
N
,
C
,
Y
,
X
,
Hi
,
Wi
,
window_stride_h
,
window_stride_w
,
in_left_pad_h
,
in_left_pad_w
,
in_right_pad_h
,
in_right_pad_w
);
return
(
pass
?
0
:
1
);
}
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using
InDataType
=
float
;
using
OutDataType
=
float
;
using
IndexDataType
=
int32_t
;
using
ComputeDataType
=
float
;
using
DInDataType
=
float
;
using
DOutDataType
=
float
;
static
constexpr
bool
PropagateNan
=
false
;
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
false
;
// Pool shape
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
1
;
ck
::
index_t
Y
=
2
;
ck
::
index_t
X
=
2
;
ck
::
index_t
Hi
=
32
;
ck
::
index_t
Wi
=
32
;
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
in_left_pad_h
=
0
;
ck
::
index_t
in_left_pad_w
=
0
;
ck
::
index_t
in_right_pad_h
=
0
;
ck
::
index_t
in_right_pad_w
=
0
;
bool
pass
=
maxpool_bwd_test
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
DOutDataType
,
PropagateNan
>
(
do_verification
,
time_kernel
,
N
,
C
,
Y
,
X
,
Hi
,
Wi
,
window_stride_h
,
window_stride_w
,
in_left_pad_h
,
in_left_pad_w
,
in_right_pad_h
,
in_right_pad_w
);
return
(
pass
?
0
:
1
);
}
example/50_put_element/CMakeLists.txt
0 → 100644
View file @
72e0c1c5
add_example_executable
(
example_put_element_fp16 put_element_fp16.cpp
)
example/50_put_element/put_element_fp16.cpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using
XDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
IndexDataType
=
int32_t
;
using
YElementwiseOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DevicePutElementImpl
<
XDataType
,
// XDataType
IndexDataType
,
// IndexDataType
YDataType
,
// YDataType
YElementwiseOp
,
ck
::
InMemoryDataOperationEnum
::
Set
,
1
>
;
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
false
;
int
N
=
1024
;
Tensor
<
XDataType
>
x
(
HostTensorDescriptor
{
N
,
1
});
Tensor
<
IndexDataType
>
indices
(
HostTensorDescriptor
{
N
,
1
});
Tensor
<
YDataType
>
y
(
HostTensorDescriptor
{
N
,
1
});
x
.
GenerateTensorValue
(
GeneratorTensor_3
<
XDataType
>
{
-
1.0
,
1.0
});
for
(
int
i
=
0
;
i
<
N
;
++
i
)
indices
(
i
)
=
i
;
DeviceMem
x_device_buf
(
sizeof
(
XDataType
)
*
x
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
y_device_buf
(
sizeof
(
YDataType
)
*
y
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
indices_device_buf
(
sizeof
(
IndexDataType
)
*
indices
.
mDesc
.
GetElementSpaceSize
());
x_device_buf
.
ToDevice
(
x
.
mData
.
data
());
indices_device_buf
.
ToDevice
(
indices
.
mData
.
data
());
auto
put_instance
=
DeviceInstance
{};
auto
put_invoker_ptr
=
put_instance
.
MakeInvokerPointer
();
auto
put_argument_ptr
=
put_instance
.
MakeArgumentPointer
(
static_cast
<
XDataType
*>
(
x_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
YDataType
*>
(
y_device_buf
.
GetDeviceBuffer
()),
N
,
N
,
YElementwiseOp
{});
if
(
!
put_instance
.
IsSupportedArgument
(
put_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"argument is not supported!"
);
}
float
ave_time
=
put_invoker_ptr
->
Run
(
put_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"perf: "
<<
ave_time
<<
" ms"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
Tensor
<
YDataType
>
y_host
(
HostTensorDescriptor
{
N
,
1
});
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
IndexDataType
idx
=
indices
(
i
);
y_host
(
idx
)
=
x
(
i
);
}
y_device_buf
.
FromDevice
(
y
.
mData
.
data
());
pass
=
ck
::
utils
::
check_err
(
y
,
y_host
);
}
return
(
pass
?
0
:
1
);
}
include/ck/host_utility/stream_utility.hpp
View file @
72e0c1c5
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
#include "ck/stream_config.hpp"
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/host_utility/hip_check_error.hpp"
static
int
getAvailableComputeUnitCount
(
const
StreamConfig
&
stream_config
)
static
inline
int
getAvailableComputeUnitCount
(
const
StreamConfig
&
stream_config
)
{
{
constexpr
int
MAX_MASK_DWORDS
=
64
;
constexpr
int
MAX_MASK_DWORDS
=
64
;
...
...
include/ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// For pooling which used indexable operation, such as MaxPool, MinPool...etc
template
<
typename
DOutDataType
,
typename
IndexDataType
,
typename
DInDataType
>
struct
DeviceIndexPoolBwd
:
public
BaseOperator
{
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_dout
,
const
void
*
p_indices
,
void
*
p_din
,
index_t
dout_length
,
index_t
din_length
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_put_element.hpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/utility/reduction_enums.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// output[indices] = input
template
<
typename
InDataType
,
typename
IndexDataType
,
typename
OutDataType
,
typename
ElementwiseOperation
,
InMemoryDataOperationEnum
Op
>
struct
DevicePutElement
:
public
BaseOperator
{
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_input
,
const
void
*
p_indices
,
void
*
p_output
,
index_t
input_length
,
index_t
output_length
,
ElementwiseOperation
elementwise_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_softmax.hpp
View file @
72e0c1c5
...
@@ -18,7 +18,8 @@ template <typename InDataType,
...
@@ -18,7 +18,8 @@ template <typename InDataType,
typename
OutDataType
,
typename
OutDataType
,
typename
InElementwiseOp
,
typename
InElementwiseOp
,
typename
AccElementwiseOp
,
typename
AccElementwiseOp
,
index_t
Rank
>
index_t
Rank
,
index_t
NumReduceDim
>
struct
DeviceSoftmax
:
public
BaseOperator
struct
DeviceSoftmax
:
public
BaseOperator
{
{
//
//
...
@@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator
...
@@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator
AccElementwiseOp
acc_elementwise_op
)
=
0
;
AccElementwiseOp
acc_elementwise_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
index_t
GetRank
()
const
=
0
;
virtual
index_t
GetNumReduceDim
()
const
=
0
;
};
};
template
<
typename
InDataType
,
template
<
typename
InDataType
,
...
@@ -58,9 +57,15 @@ template <typename InDataType,
...
@@ -58,9 +57,15 @@ template <typename InDataType,
typename
OutDataType
,
typename
OutDataType
,
typename
InElementwiseOp
,
typename
InElementwiseOp
,
typename
AccElementwiseOp
,
typename
AccElementwiseOp
,
index_t
Rank
>
index_t
Rank
,
using
DeviceSoftmaxPtr
=
std
::
unique_ptr
<
index_t
NumReduceDim
>
DeviceSoftmax
<
InDataType
,
AccDataType
,
OutDataType
,
InElementwiseOp
,
AccElementwiseOp
,
Rank
>>
;
using
DeviceSoftmaxPtr
=
std
::
unique_ptr
<
DeviceSoftmax
<
InDataType
,
AccDataType
,
OutDataType
,
InElementwiseOp
,
AccElementwiseOp
,
Rank
,
NumReduceDim
>>
;
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
...
...
include/ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_index_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// output[indices] = input
template
<
typename
DOutDataType
,
typename
IndexDataType
,
typename
DInDataType
,
ck
::
index_t
InOutVectorSize
>
struct
DeviceIndexPoolBwdImpl
:
public
DeviceIndexPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>
{
using
DInDataType_AutomicAddPreCast
=
conditional_t
<
is_same_v
<
DInDataType
,
float
>
||
is_same_v
<
DInDataType
,
double
>
,
DInDataType
,
float
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
UnaryConvert
=
ck
::
tensor_operation
::
element_wise
::
UnaryConvert
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
template
<
typename
Desc_M
>
static
auto
PadDescriptor_M_1d
(
Desc_M
desc_m
,
index_t
loop_step
)
{
const
auto
m
=
desc_m
.
GetLength
(
I0
);
const
auto
pad
=
math
::
integer_least_multiple
(
m
,
loop_step
)
-
m
;
const
auto
desc_m_pad
=
transform_tensor_descriptor
(
desc_m
,
make_tuple
(
make_right_pad_transform
(
m
,
pad
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
desc_m_pad
;
}
static
auto
MakeDescriptor_M
(
index_t
length
,
index_t
loop_step
)
{
const
auto
desc_m
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
length
));
return
PadDescriptor_M_1d
(
desc_m
,
loop_step
);
}
using
InOutGrid1dDesc
=
decltype
(
MakeDescriptor_M
(
1
,
1
));
using
GridwisePutElementSet
=
GridwisePutElement_1D
<
InOutGrid1dDesc
,
DOutDataType
,
IndexDataType
,
DInDataType
,
PassThrough
,
InMemoryDataOperationEnum
::
Set
,
InOutVectorSize
>
;
using
GridwisePutElementAtomicAdd
=
GridwisePutElement_1D
<
InOutGrid1dDesc
,
DOutDataType
,
IndexDataType
,
DInDataType_AutomicAddPreCast
,
PassThrough
,
InMemoryDataOperationEnum
::
AtomicAdd
,
InOutVectorSize
>
;
using
GridwiseCasting
=
GridwiseElementwise_1D
<
Tuple
<
InOutGrid1dDesc
>
,
Tuple
<
InOutGrid1dDesc
>
,
Tuple
<
const
DInDataType_AutomicAddPreCast
*>
,
Tuple
<
DInDataType
*>
,
UnaryConvert
,
InOutVectorSize
,
Sequence
<
InOutVectorSize
>
,
Sequence
<
InOutVectorSize
>>
;
struct
Argument
:
public
BaseArgument
{
Argument
(
const
DOutDataType
*
p_dout
,
const
IndexDataType
*
p_indices
,
DInDataType
*
p_din
,
index_t
dout_length
,
index_t
din_length
,
const
std
::
vector
<
ck
::
index_t
>&
window_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
window_strides
)
:
p_dout_
{
p_dout
},
p_indices_
{
p_indices
},
p_din_
{
p_din
},
dout_length_raw_
{
dout_length
},
din_length_raw_
{
din_length
},
blockSize_
{
256
},
windowOverlap_
{
false
}
{
for
(
size_t
i
=
0
;
i
<
window_lengths
.
size
();
++
i
)
{
windowOverlap_
|=
window_lengths
.
at
(
i
)
>
window_strides
.
at
(
i
);
}
}
const
DOutDataType
*
p_dout_
;
const
IndexDataType
*
p_indices_
;
DInDataType
*
p_din_
;
index_t
dout_length_raw_
;
index_t
din_length_raw_
;
index_t
blockSize_
;
bool
windowOverlap_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
index_t
gridSize
=
getAvailableComputeUnitCount
(
stream_config
);
index_t
loop_step
=
gridSize
*
arg
.
blockSize_
*
InOutVectorSize
;
InOutGrid1dDesc
din_grid_desc
=
MakeDescriptor_M
(
arg
.
din_length_raw_
,
loop_step
);
InOutGrid1dDesc
dout_grid_desc
=
MakeDescriptor_M
(
arg
.
dout_length_raw_
,
loop_step
);
if
constexpr
(
is_same_v
<
DInDataType
,
float
>
||
is_same_v
<
DInDataType
,
double
>
)
{
hip_check_error
(
hipMemsetAsync
(
arg
.
p_din_
,
0
,
arg
.
din_length_raw_
*
sizeof
(
DInDataType
),
stream_config
.
stream_id_
));
if
(
arg
.
windowOverlap_
)
{
const
auto
put_kernel
=
kernel_put_element_1d
<
GridwisePutElementAtomicAdd
,
InOutGrid1dDesc
,
DOutDataType
,
IndexDataType
,
DInDataType
,
PassThrough
>
;
return
launch_and_time_kernel
(
stream_config
,
put_kernel
,
dim3
(
gridSize
),
dim3
(
arg
.
blockSize_
),
0
,
dout_grid_desc
,
arg
.
p_dout_
,
arg
.
p_indices_
,
arg
.
p_din_
,
PassThrough
{});
}
else
{
const
auto
put_kernel
=
kernel_put_element_1d
<
GridwisePutElementSet
,
InOutGrid1dDesc
,
DOutDataType
,
IndexDataType
,
DInDataType
,
PassThrough
>
;
return
launch_and_time_kernel
(
stream_config
,
put_kernel
,
dim3
(
gridSize
),
dim3
(
arg
.
blockSize_
),
0
,
dout_grid_desc
,
arg
.
p_dout_
,
arg
.
p_indices_
,
arg
.
p_din_
,
PassThrough
{});
}
}
else
{
if
(
arg
.
windowOverlap_
)
{
if
(
arg
.
p_workspace_
==
nullptr
)
throw
std
::
runtime_error
(
"wrong! WorkSpace pointer has not been set"
);
hip_check_error
(
hipMemsetAsync
(
arg
.
p_workspace_
,
0
,
arg
.
din_length_raw_
*
sizeof
(
DInDataType_AutomicAddPreCast
),
stream_config
.
stream_id_
));
const
auto
put_kernel
=
kernel_put_element_1d
<
GridwisePutElementAtomicAdd
,
InOutGrid1dDesc
,
DOutDataType
,
IndexDataType
,
DInDataType_AutomicAddPreCast
,
PassThrough
>
;
const
auto
cast_kernel
=
kernel_elementwise_1d
<
GridwiseCasting
,
Tuple
<
InOutGrid1dDesc
>
,
Tuple
<
InOutGrid1dDesc
>
,
Tuple
<
const
DInDataType_AutomicAddPreCast
*>
,
Tuple
<
DInDataType
*>
,
UnaryConvert
>
;
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
put_kernel
,
dim3
(
gridSize
),
dim3
(
arg
.
blockSize_
),
0
,
dout_grid_desc
,
arg
.
p_dout_
,
arg
.
p_indices_
,
static_cast
<
DInDataType_AutomicAddPreCast
*>
(
arg
.
p_workspace_
),
PassThrough
{});
elapsed_time
+=
launch_and_time_kernel
(
stream_config
,
cast_kernel
,
dim3
(
gridSize
),
dim3
(
arg
.
blockSize_
),
0
,
ck
::
make_tuple
(
din_grid_desc
),
ck
::
make_tuple
(
din_grid_desc
),
static_cast
<
DInDataType_AutomicAddPreCast
*>
(
arg
.
p_workspace_
),
arg
.
p_din_
,
UnaryConvert
{});
return
elapsed_time
;
}
else
{
const
auto
put_kernel
=
kernel_put_element_1d
<
GridwisePutElementSet
,
InOutGrid1dDesc
,
DOutDataType
,
IndexDataType
,
DInDataType
,
PassThrough
>
;
hip_check_error
(
hipMemsetAsync
(
arg
.
p_din_
,
0
,
arg
.
din_length_raw_
*
sizeof
(
DInDataType
),
stream_config
.
stream_id_
));
return
launch_and_time_kernel
(
stream_config
,
put_kernel
,
dim3
(
gridSize
),
dim3
(
arg
.
blockSize_
),
0
,
dout_grid_desc
,
arg
.
p_dout_
,
arg
.
p_indices_
,
arg
.
p_din_
,
PassThrough
{});
}
}
}
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
size_t
GetWorkSpaceSize
(
const
BaseArgument
*
pArg
)
const
override
{
const
Argument
*
pArg_
=
dynamic_cast
<
const
Argument
*>
(
pArg
);
bool
needCast
=
pArg_
->
windowOverlap_
&&
!
(
is_same_v
<
DInDataType
,
float
>
||
is_same_v
<
DInDataType
,
double
>
);
if
(
!
needCast
)
return
0
;
else
return
pArg_
->
din_length_raw_
*
sizeof
(
DInDataType_AutomicAddPreCast
);
};
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
const
Argument
*
pArg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
pArg
->
din_length_raw_
%
InOutVectorSize
!=
0
||
pArg
->
dout_length_raw_
%
InOutVectorSize
!=
0
)
{
return
false
;
}
return
true
;
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_dout
,
const
void
*
p_indices
,
void
*
p_din
,
index_t
dout_length
,
index_t
din_length
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
)
override
{
// Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
// physical size of the packed tensor
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
DOutDataType
*>
(
p_dout
),
static_cast
<
const
IndexDataType
*>
(
p_indices
),
static_cast
<
DInDataType
*>
(
p_din
),
dout_length
,
din_length
,
window_lengths
,
window_strides
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_put_element.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/stream_utility.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// output[indices] = input
template
<
typename
InDataType
,
typename
IndexDataType
,
typename
OutDataType
,
typename
ElementwiseOperation
,
InMemoryDataOperationEnum
MemOp
,
ck
::
index_t
InVectorSize
>
struct
DevicePutElementImpl
:
public
DevicePutElement
<
InDataType
,
IndexDataType
,
OutDataType
,
ElementwiseOperation
,
MemOp
>
{
template
<
typename
Desc_M
>
static
auto
PadDescriptor_M_1d
(
Desc_M
desc_m
,
index_t
gridSize
,
index_t
blockSize
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
const
auto
m
=
desc_m
.
GetLength
(
I0
);
const
index_t
loop_step
=
gridSize
*
blockSize
*
InVectorSize
;
const
auto
pad
=
math
::
integer_least_multiple
(
m
,
loop_step
)
-
m
;
const
auto
desc_m_pad
=
transform_tensor_descriptor
(
desc_m
,
make_tuple
(
make_right_pad_transform
(
m
,
pad
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
desc_m_pad
;
}
static
auto
MakeDescriptor_M
(
index_t
length
,
index_t
gridSize
,
index_t
blockSize
)
{
const
auto
desc_m
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
length
));
return
PadDescriptor_M_1d
(
desc_m
,
gridSize
,
blockSize
);
}
using
InGrid1dDesc
=
decltype
(
MakeDescriptor_M
(
1
,
1
,
1
));
using
GridwisePutElement
=
GridwisePutElement_1D
<
InGrid1dDesc
,
InDataType
,
IndexDataType
,
OutDataType
,
ElementwiseOperation
,
MemOp
,
InVectorSize
>
;
struct
Argument
:
public
BaseArgument
{
Argument
(
const
InDataType
*
p_input
,
const
IndexDataType
*
p_indices
,
OutDataType
*
p_output
,
index_t
input_length
,
ElementwiseOperation
elementwise_op
)
:
p_input_
{
p_input
},
p_indices_
{
p_indices
},
p_output_
{
p_output
},
input_length_raw_
{
input_length
},
elementwise_op_
{
elementwise_op
},
blockSize_
{
256
}
{
}
const
InDataType
*
p_input_
;
const
IndexDataType
*
p_indices_
;
OutDataType
*
p_output_
;
index_t
input_length_raw_
;
ElementwiseOperation
elementwise_op_
;
index_t
blockSize_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
index_t
gridSize
=
getAvailableComputeUnitCount
(
stream_config
);
InGrid1dDesc
in_grid_desc
=
MakeDescriptor_M
(
arg
.
input_length_raw_
,
gridSize
,
arg
.
blockSize_
);
const
auto
kernel
=
kernel_put_element_1d
<
GridwisePutElement
,
InGrid1dDesc
,
InDataType
,
IndexDataType
,
OutDataType
,
ElementwiseOperation
>
;
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
gridSize
),
dim3
(
arg
.
blockSize_
),
0
,
in_grid_desc
,
arg
.
p_input_
,
arg
.
p_indices_
,
arg
.
p_output_
,
arg
.
elementwise_op_
);
return
elapsed_time
;
}
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
const
Argument
*
pArg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
pArg
->
input_length_raw_
%
InVectorSize
!=
0
)
{
return
false
;
}
return
true
;
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_input
,
const
void
*
p_indices
,
void
*
p_output
,
index_t
input_length
,
index_t
,
ElementwiseOperation
elementwise_op
)
override
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
InDataType
*>
(
p_input
),
static_cast
<
const
IndexDataType
*>
(
p_indices
),
static_cast
<
OutDataType
*>
(
p_output
),
input_length
,
elementwise_op
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
View file @
72e0c1c5
...
@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
...
@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
OutDataType
,
OutDataType
,
InElementwiseOp
,
InElementwiseOp
,
AccElementwiseOp
,
AccElementwiseOp
,
Rank
>
Rank
,
NumReduceDim
>
{
{
static
constexpr
index_t
kRank
=
Rank
;
static
constexpr
index_t
kNumReduceDim
=
NumReduceDim
;
static
constexpr
index_t
kNumInvariantDim
=
Rank
-
NumReduceDim
;
virtual
index_t
GetRank
()
const
override
{
return
kRank
;
}
virtual
index_t
GetNumReduceDim
()
const
override
{
return
kNumReduceDim
;
}
static
constexpr
index_t
NumInvariantDim
=
Rank
-
NumReduceDim
;
static
constexpr
index_t
NumInvariantDim
=
Rank
-
NumReduceDim
;
static
constexpr
index_t
NumSrcDim
=
Rank
;
static
constexpr
index_t
NumSrcDim
=
Rank
;
...
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
...
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
{
{
if
constexpr
(
InSrcVectorDim
==
0
)
if
constexpr
(
InSrcVectorDim
==
0
)
{
{
if
constexpr
(
k
NumInvariantDim
==
0
)
if
constexpr
(
NumInvariantDim
==
0
)
{
{
return
false
;
return
false
;
}
}
else
else
{
{
if
(
arg
.
inStrides_
[
k
NumInvariantDim
-
1
]
!=
1
&&
InSrcVectorSize
!=
1
)
if
(
arg
.
inStrides_
[
NumInvariantDim
-
1
]
!=
1
&&
InSrcVectorSize
!=
1
)
{
{
return
false
;
return
false
;
}
}
...
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
...
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
}
}
// To improve
// To improve
if
(
k
NumInvariantDim
>
0
&&
arg
.
invariant_lowest_length_
%
OutDstVectorSize
!=
0
)
if
(
NumInvariantDim
>
0
&&
arg
.
invariant_lowest_length_
%
OutDstVectorSize
!=
0
)
{
{
return
false
;
return
false
;
}
}
...
...
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
View file @
72e0c1c5
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
#include "ck/utility/data_type.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/math.hpp"
#include "ck/utility/math.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/utility/type_convert.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -81,6 +82,36 @@ struct PassThrough
...
@@ -81,6 +82,36 @@ struct PassThrough
y
=
x
;
y
=
x
;
}
}
#endif
#endif
template
<
>
__host__
__device__
void
operator
()
<
f8_t
,
f8_t
>
(
f8_t
&
y
,
const
f8_t
&
x
)
const
{
y
=
x
;
}
template
<
>
__host__
__device__
void
operator
()
<
float
,
f8_t
>
(
float
&
y
,
const
f8_t
&
x
)
const
{
y
=
type_convert
<
float
>
(
x
);
}
template
<
>
__host__
__device__
void
operator
()
<
f8_t
,
float
>
(
f8_t
&
y
,
const
float
&
x
)
const
{
y
=
type_convert
<
f8_t
>
(
x
);
}
template
<
>
__host__
__device__
void
operator
()
<
half_t
,
f8_t
>
(
half_t
&
y
,
const
f8_t
&
x
)
const
{
y
=
type_convert
<
half_t
>
(
x
);
}
template
<
>
__host__
__device__
void
operator
()
<
f8_t
,
half_t
>
(
f8_t
&
y
,
const
half_t
&
x
)
const
{
y
=
type_convert
<
f8_t
>
(
x
);
}
};
};
struct
UnaryConvert
struct
UnaryConvert
...
@@ -109,6 +140,23 @@ struct ConvertBF16RTN
...
@@ -109,6 +140,23 @@ struct ConvertBF16RTN
}
}
};
};
struct
ConvertF8SR
{
// convert to fp8 using stochastic rounding (SR)
template
<
typename
Y
,
typename
X
>
__host__
__device__
void
operator
()(
Y
&
y
,
const
X
&
x
)
const
{
// check Y datatype
static_assert
(
is_same
<
Y
,
f8_t
>::
value
,
"Data type is not supported by this operation!"
);
// check X datatype
static_assert
(
is_same
<
X
,
float
>::
value
||
is_same
<
X
,
half_t
>::
value
,
"Data type is not supported by this operation!"
);
y
=
f8_convert_sr
<
Y
>
(
x
);
}
};
struct
Scale
struct
Scale
{
{
__host__
__device__
Scale
(
float
scale
)
:
scale_
(
scale
)
{}
__host__
__device__
Scale
(
float
scale
)
:
scale_
(
scale
)
{}
...
...
include/ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp
0 → 100644
View file @
72e0c1c5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
namespace
ck
{
template
<
typename
GridwisePutElementwise1dFunctor
,
typename
InGrid1dDesc
,
typename
InDataType
,
typename
IndexDataType
,
typename
OutDataType
,
typename
ElementwiseOperation
>
__global__
void
kernel_put_element_1d
(
const
InGrid1dDesc
in_grid_1d_desc
,
const
InDataType
*
__restrict__
p_in_global
,
const
IndexDataType
*
__restrict__
p_indices_global
,
OutDataType
*
__restrict__
p_out_global
,
const
ElementwiseOperation
elementwise_op
)
{
GridwisePutElementwise1dFunctor
::
Run
(
in_grid_1d_desc
,
p_in_global
,
p_indices_global
,
p_out_global
,
elementwise_op
);
}
// output[indices] = input
template
<
typename
InGrid1dDesc
,
typename
InDataType
,
typename
IndexDataType
,
typename
OutDataType
,
typename
ElementwiseOperation
,
InMemoryDataOperationEnum
MemOp
,
index_t
InVectorSize
>
struct
GridwisePutElement_1D
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
thread_buffer_desc_m
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
InVectorSize
>
{}));
__device__
static
void
Run
(
const
InGrid1dDesc
&
in_grid_1d_desc
,
const
InDataType
*
__restrict__
p_in_global
,
const
IndexDataType
*
__restrict__
p_indices_global
,
OutDataType
*
__restrict__
p_out_global
,
const
ElementwiseOperation
&
elementwise_op
)
{
// Global Memory
const
auto
in_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_global
,
in_grid_1d_desc
.
GetElementSpaceSize
());
const
auto
indices_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_indices_global
,
in_grid_1d_desc
.
GetElementSpaceSize
(),
NumericLimits
<
IndexDataType
>::
Lowest
());
// VGPR
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
InDataType
,
InVectorSize
,
true
>
in_thread_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
IndexDataType
,
InVectorSize
,
true
>
indices_thread_buf
;
// Thread id, Block id and index
const
index_t
thread_global_id
=
get_thread_global_1d_id
();
const
auto
thread_global_offset
=
make_multi_index
(
thread_global_id
*
InVectorSize
);
const
index_t
blockSize
=
get_block_size
();
const
index_t
blockPerGrid
=
get_grid_size
();
const
auto
M
=
in_grid_1d_desc
.
GetLength
(
I0
);
const
index_t
loop_step
=
blockPerGrid
*
blockSize
*
InVectorSize
;
const
auto
loop_step_index
=
make_multi_index
(
loop_step
);
auto
in_global_load
=
ThreadwiseTensorSliceTransfer_v2
<
InDataType
,
InDataType
,
decltype
(
in_grid_1d_desc
),
decltype
(
thread_buffer_desc_m
),
Sequence
<
InVectorSize
>
,
// SliceLengths
Sequence
<
0
>
,
// DimAccessOrder
0
,
// SrcVectorDim
InVectorSize
,
// ScalarPerVector
1
,
// SrcScalarStrideInVector
false
>
{
in_grid_1d_desc
,
thread_global_offset
};
auto
indices_global_load
=
ThreadwiseTensorSliceTransfer_v2
<
IndexDataType
,
IndexDataType
,
decltype
(
in_grid_1d_desc
),
decltype
(
thread_buffer_desc_m
),
Sequence
<
InVectorSize
>
,
// SliceLengths
Sequence
<
0
>
,
// DimAccessOrder
0
,
// SrcVectorDim
InVectorSize
,
// ScalarPerVector
1
,
// SrcScalarStrideInVector
false
>
{
in_grid_1d_desc
,
thread_global_offset
};
index_t
num_iter
=
M
/
loop_step
;
do
{
in_global_load
.
Run
(
in_grid_1d_desc
,
in_global_buf
,
thread_buffer_desc_m
,
make_tuple
(
I0
),
in_thread_buf
);
in_global_load
.
MoveSrcSliceWindow
(
in_grid_1d_desc
,
loop_step_index
);
static_for
<
0
,
InVectorSize
,
1
>
{}(
[
&
](
auto
iM
)
{
elementwise_op
(
in_thread_buf
(
iM
),
in_thread_buf
[
iM
]);
});
indices_global_load
.
Run
(
in_grid_1d_desc
,
indices_global_buf
,
thread_buffer_desc_m
,
make_tuple
(
I0
),
indices_thread_buf
);
indices_global_load
.
MoveSrcSliceWindow
(
in_grid_1d_desc
,
loop_step_index
);
static_for
<
0
,
InVectorSize
,
1
>
{}([
&
](
auto
iM
)
{
if
(
indices_thread_buf
[
iM
]
>=
0
)
{
if
constexpr
(
MemOp
==
InMemoryDataOperationEnum
::
Set
)
{
// User should guarantee each index in p_indices_global is different
*
(
p_out_global
+
indices_thread_buf
[
iM
])
=
ck
::
type_convert
<
OutDataType
>
(
in_thread_buf
[
iM
]);
}
else
if
constexpr
(
MemOp
==
InMemoryDataOperationEnum
::
AtomicAdd
)
{
atomic_add
<
OutDataType
>
(
p_out_global
+
indices_thread_buf
[
iM
],
ck
::
type_convert
<
OutDataType
>
(
in_thread_buf
[
iM
]));
}
else
if
constexpr
(
MemOp
==
InMemoryDataOperationEnum
::
AtomicMax
)
{
atomic_max
<
OutDataType
>
(
p_out_global
+
indices_thread_buf
[
iM
],
ck
::
type_convert
<
OutDataType
>
(
in_thread_buf
[
iM
]));
}
else
if
constexpr
(
MemOp
==
InMemoryDataOperationEnum
::
Add
)
{
// User should guarantee each index in p_indices_global is different
*
(
p_out_global
+
indices_thread_buf
[
iM
])
+=
ck
::
type_convert
<
OutDataType
>
(
in_thread_buf
[
iM
]);
}
else
{
static_assert
(
MemOp
==
InMemoryDataOperationEnum
::
Set
||
MemOp
==
InMemoryDataOperationEnum
::
AtomicAdd
||
MemOp
==
InMemoryDataOperationEnum
::
AtomicMax
||
MemOp
==
InMemoryDataOperationEnum
::
Add
);
}
}
});
}
while
(
--
num_iter
);
}
};
}
// namespace ck
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment