Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
5a7f7334
Commit
5a7f7334
authored
Aug 31, 2023
by
Jing Zhang
Browse files
merge develop
parents
14822d71
f5ec04f0
Changes
46
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
760 additions
and
23 deletions
+760
-23
client_example/05_layernorm/layernorm2d.cpp
client_example/05_layernorm/layernorm2d.cpp
+8
-0
client_example/18_groupnorm/groupnorm_swish.cpp
client_example/18_groupnorm/groupnorm_swish.cpp
+8
-0
client_example/19_pool/CMakeLists.txt
client_example/19_pool/CMakeLists.txt
+11
-0
client_example/19_pool/avg_pool3d_bwd.cpp
client_example/19_pool/avg_pool3d_bwd.cpp
+191
-0
client_example/19_pool/avg_pool3d_fwd.cpp
client_example/19_pool/avg_pool3d_fwd.cpp
+0
-0
client_example/19_pool/max_pool2d_bwd.cpp
client_example/19_pool/max_pool2d_bwd.cpp
+280
-0
client_example/19_pool/max_pool2d_fwd.cpp
client_example/19_pool/max_pool2d_fwd.cpp
+0
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+4
-3
include/ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp
...de/ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp
+3
-2
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
...tion/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
+1
-1
include/ck/tensor_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp
...or_operation/gpu/device/impl/device_max_pool_bwd_impl.hpp
+15
-6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
...gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
+4
-4
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
...ensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
+2
-1
include/ck/utility/reduction_operator.hpp
include/ck/utility/reduction_operator.hpp
+60
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp
.../reference_tensor_operation/cpu/reference_maxpool_bwd.hpp
+10
-1
library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
...ary/reference_tensor_operation/cpu/reference_pool_fwd.hpp
+4
-2
library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp
.../library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp
+59
-0
library/include/ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
...ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
+58
-0
library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
...e/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
+31
-2
library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt
...nsor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt
+11
-0
No files found.
client_example/05_layernorm/layernorm2d.cpp
View file @
5a7f7334
...
@@ -100,6 +100,10 @@ int main(int argc, char* argv[])
...
@@ -100,6 +100,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
M
*
N
+
sizeof
(
GammaDataType
)
*
N
+
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
M
*
N
+
sizeof
(
GammaDataType
)
*
N
+
...
@@ -153,6 +157,10 @@ int main(int argc, char* argv[])
...
@@ -153,6 +157,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
}
...
...
client_example/18_groupnorm/groupnorm_swish.cpp
View file @
5a7f7334
...
@@ -129,6 +129,10 @@ int main(int argc, char* argv[])
...
@@ -129,6 +129,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_byte
=
std
::
size_t
num_byte
=
...
@@ -184,6 +188,10 @@ int main(int argc, char* argv[])
...
@@ -184,6 +188,10 @@ int main(int argc, char* argv[])
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
}
...
...
client_example/19_pool
_fwd
/CMakeLists.txt
→
client_example/19_pool/CMakeLists.txt
View file @
5a7f7334
add_executable
(
client_max_pool2d_fwd max_pool2d_fwd.cpp
)
add_executable
(
client_max_pool2d_fwd max_pool2d_fwd.cpp
)
target_link_libraries
(
client_max_pool2d_fwd PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_max_pool2d_fwd PRIVATE composable_kernel::device_operations
)
add_executable
(
client_max_pool2d_bwd max_pool2d_bwd.cpp
)
target_link_libraries
(
client_max_pool2d_bwd PRIVATE composable_kernel::device_operations
)
add_executable
(
client_avg_pool3d_fwd avg_pool3d_fwd.cpp
)
add_executable
(
client_avg_pool3d_fwd avg_pool3d_fwd.cpp
)
target_link_libraries
(
client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations
)
add_executable
(
client_avg_pool3d_bwd avg_pool3d_bwd.cpp
)
target_link_libraries
(
client_avg_pool3d_bwd PRIVATE composable_kernel::device_operations
)
client_example/19_pool/avg_pool3d_bwd.cpp
0 → 100644
View file @
5a7f7334
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
using
DOutDataType
=
ck
::
half_t
;
using
DInDataType
=
ck
::
half_t
;
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{},
mMemSize_
(
mem_size
)
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
void
SetZero
()
const
{
(
void
)
hipMemset
(
p_mem_
,
0
,
mMemSize_
);
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
std
::
size_t
mMemSize_
;
};
int
main
(
int
argc
,
char
*
argv
[])
{
ck
::
index_t
N
=
2
;
ck
::
index_t
C
=
32
;
ck
::
index_t
Z
=
2
;
ck
::
index_t
Y
=
2
;
ck
::
index_t
X
=
2
;
ck
::
index_t
Di
=
30
;
ck
::
index_t
Hi
=
30
;
ck
::
index_t
Wi
=
30
;
ck
::
index_t
window_stride_d
=
2
;
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
window_dilation_d
=
1
;
ck
::
index_t
window_dilation_h
=
1
;
ck
::
index_t
window_dilation_w
=
1
;
ck
::
index_t
in_left_pad_d
=
1
;
ck
::
index_t
in_left_pad_h
=
1
;
ck
::
index_t
in_left_pad_w
=
1
;
ck
::
index_t
in_right_pad_d
=
1
;
ck
::
index_t
in_right_pad_h
=
1
;
ck
::
index_t
in_right_pad_w
=
1
;
const
ck
::
index_t
Zs
=
(
Z
-
1
)
*
window_dilation_d
+
1
;
const
ck
::
index_t
Ys
=
(
Y
-
1
)
*
window_dilation_h
+
1
;
const
ck
::
index_t
Xs
=
(
X
-
1
)
*
window_dilation_w
+
1
;
ck
::
index_t
Do
=
(
Di
+
in_left_pad_d
+
in_right_pad_d
-
Zs
)
/
window_stride_d
+
1
;
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Ys
)
/
window_stride_h
+
1
;
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
Xs
)
/
window_stride_w
+
1
;
// Pool API only support the order of NCDHW
std
::
vector
<
ck
::
index_t
>
in_length
=
{
N
,
C
,
Di
,
Hi
,
Wi
};
std
::
vector
<
ck
::
index_t
>
out_length
=
{
N
,
C
,
Do
,
Ho
,
Wo
};
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
=
{
Z
,
Y
,
X
};
std
::
vector
<
ck
::
index_t
>
window_strides
=
{
window_stride_d
,
window_stride_h
,
window_stride_w
};
std
::
vector
<
ck
::
index_t
>
window_dilations
{
window_dilation_d
,
window_dilation_h
,
window_dilation_w
};
std
::
vector
<
ck
::
index_t
>
input_left_pads
=
{
in_left_pad_d
,
in_left_pad_h
,
in_left_pad_w
};
std
::
vector
<
ck
::
index_t
>
input_right_pads
=
{
in_right_pad_d
,
in_right_pad_h
,
in_right_pad_w
};
std
::
size_t
in_tensor_size
=
N
*
C
*
Di
*
Hi
*
Wi
;
std
::
size_t
out_tensor_size
=
N
*
C
*
Do
*
Ho
*
Wo
;
// tensor layout = NDHWC
std
::
vector
<
ck
::
index_t
>
in_tensor_stride
=
{
Di
*
C
*
Hi
*
Wi
,
1
,
C
*
Hi
*
Wi
,
Wi
*
C
,
C
};
std
::
vector
<
ck
::
index_t
>
out_tensor_stride
=
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
};
SimpleDeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
out_tensor_size
);
SimpleDeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
in_tensor_size
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceAvgPoolBwd
<
3
,
DOutDataType
,
DInDataType
,
DOutLayout
,
DInLayout
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_length
,
in_length
,
out_tensor_stride
,
in_tensor_stride
,
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
din_device_buf
.
SetZero
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
in_tensor_size
*
sizeof
(
DInDataType
)
+
out_tensor_size
*
sizeof
(
DOutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
ave_time
<
best_ave_time
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
// run the best intance
if
(
found
)
{
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_length
,
in_length
,
out_tensor_stride
,
in_tensor_stride
,
window_spatial_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
din_device_buf
.
SetZero
();
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
}
client_example/19_pool
_fwd
/avg_pool3d_fwd.cpp
→
client_example/19_pool/avg_pool3d_fwd.cpp
View file @
5a7f7334
File moved
client_example/19_pool/max_pool2d_bwd.cpp
0 → 100644
View file @
5a7f7334
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
DOutDataType
=
ck
::
half_t
;
using
DInDataType
=
ck
::
half_t
;
using
IndexDataType
=
int32_t
;
// We use pool3d to implement pool2d in this example
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
constexpr
ck
::
index_t
InOutRank
=
5
;
constexpr
ck
::
index_t
WindowRank
=
3
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
void
TransformPool2dparamToPool3d
(
std
::
vector
<
ck
::
index_t
>&
input_lengths
,
std
::
vector
<
ck
::
index_t
>&
window_lengths
,
std
::
vector
<
ck
::
index_t
>&
output_lengths
,
std
::
vector
<
ck
::
index_t
>&
input_stride
,
std
::
vector
<
ck
::
index_t
>&
output_stride
,
std
::
vector
<
ck
::
index_t
>&
indices_stride
,
std
::
vector
<
ck
::
index_t
>&
window_strides
,
std
::
vector
<
ck
::
index_t
>&
window_dilations
,
std
::
vector
<
ck
::
index_t
>&
input_left_pads
,
std
::
vector
<
ck
::
index_t
>&
input_right_pads
,
std
::
vector
<
ck
::
index_t
>&
pooling_dims
)
{
// NCHW to NCDHW
input_lengths
.
insert
(
input_lengths
.
begin
()
+
2
,
1
);
output_lengths
.
insert
(
output_lengths
.
begin
()
+
2
,
1
);
input_stride
.
insert
(
input_stride
.
begin
()
+
2
,
0
);
output_stride
.
insert
(
output_stride
.
begin
()
+
2
,
0
);
indices_stride
.
insert
(
indices_stride
.
begin
()
+
2
,
0
);
// YX to ZYX
window_lengths
.
insert
(
window_lengths
.
begin
(),
1
);
window_strides
.
insert
(
window_strides
.
begin
(),
0
);
window_dilations
.
insert
(
window_dilations
.
begin
(),
0
);
input_left_pads
.
insert
(
input_left_pads
.
begin
(),
0
);
input_right_pads
.
insert
(
input_right_pads
.
begin
(),
0
);
pooling_dims
=
{
2
,
3
,
4
};
}
int
main
(
int
argc
,
char
*
argv
[])
{
ck
::
index_t
N
=
2
;
ck
::
index_t
C
=
32
;
ck
::
index_t
Y
=
2
;
ck
::
index_t
X
=
2
;
ck
::
index_t
Hi
=
30
;
ck
::
index_t
Wi
=
30
;
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
window_dilation_h
=
1
;
ck
::
index_t
window_dilation_w
=
1
;
ck
::
index_t
in_left_pad_h
=
1
;
ck
::
index_t
in_left_pad_w
=
1
;
ck
::
index_t
in_right_pad_h
=
1
;
ck
::
index_t
in_right_pad_w
=
1
;
const
ck
::
index_t
Ys
=
(
Y
-
1
)
*
window_dilation_h
+
1
;
const
ck
::
index_t
Xs
=
(
X
-
1
)
*
window_dilation_w
+
1
;
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Ys
)
/
window_stride_h
+
1
;
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
Xs
)
/
window_stride_w
+
1
;
// Pool API only support the order of NCHW
std
::
vector
<
ck
::
index_t
>
in_length
=
{
N
,
C
,
Hi
,
Wi
};
std
::
vector
<
ck
::
index_t
>
out_length
=
{
N
,
C
,
Ho
,
Wo
};
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
=
{
Y
,
X
};
std
::
vector
<
ck
::
index_t
>
window_strides
=
{
window_stride_h
,
window_stride_w
};
std
::
vector
<
ck
::
index_t
>
window_dilations
=
{
window_dilation_h
,
window_dilation_w
};
std
::
vector
<
ck
::
index_t
>
input_left_pads
=
{
in_left_pad_h
,
in_left_pad_w
};
std
::
vector
<
ck
::
index_t
>
input_right_pads
=
{
in_right_pad_h
,
in_right_pad_w
};
std
::
vector
<
ck
::
index_t
>
pooling_dims
=
{
2
,
3
};
std
::
size_t
in_tensor_size
=
N
*
C
*
Hi
*
Wi
;
std
::
size_t
out_tensor_size
=
N
*
C
*
Ho
*
Wo
;
// tensor layout = NHWC
std
::
vector
<
ck
::
index_t
>
in_tensor_stride
=
{
C
*
Hi
*
Wi
,
1
,
Wi
*
C
,
C
};
std
::
vector
<
ck
::
index_t
>
out_tensor_stride
=
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
};
TransformPool2dparamToPool3d
(
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
pooling_dims
);
SimpleDeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_tensor_size
);
SimpleDeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_tensor_size
);
SimpleDeviceMem
indices_device_buf
(
sizeof
(
IndexDataType
)
*
out_tensor_size
);
SimpleDeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
out_tensor_size
);
SimpleDeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
in_tensor_size
);
// Generate index data from max pool forward
{
using
MaxPoolFwdDeviceOp
=
ck
::
tensor_operation
::
device
::
DevicePoolFwd
<
InOutRank
,
WindowRank
,
InDataType
,
OutDataType
,
IndexDataType
,
InLayout
,
OutLayout
,
ck
::
ReduceTensorOp
::
MAX
,
true
>
;
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
MaxPoolFwdDeviceOp
>::
GetInstances
();
auto
&
op_ptr
=
op_ptrs
[
0
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
in_length
,
window_spatial_lengths
,
out_length
,
in_tensor_stride
,
out_tensor_stride
,
out_tensor_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
pooling_dims
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
}
// Run MaxPool bwd
using
MaxPoolBwdDeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>
;
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
MaxPoolBwdDeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_tensor_size
,
in_tensor_size
,
window_spatial_lengths
,
window_strides
,
window_dilations
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
num_bytes
=
in_tensor_size
*
sizeof
(
DInDataType
)
+
out_tensor_size
*
sizeof
(
IndexDataType
)
+
out_tensor_size
*
sizeof
(
DOutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
gb_per_sec
<<
"GB / s,"
<<
op_name
<<
std
::
endl
;
if
(
ave_time
<
best_ave_time
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
// run the best intance
if
(
found
)
{
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
out_tensor_size
,
in_tensor_size
,
window_spatial_lengths
,
window_strides
,
window_dilations
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
size_t
workspace_sz
=
op_ptr
->
GetWorkSpaceSize
(
argument_ptr
.
get
());
SimpleDeviceMem
workspace
(
workspace_sz
);
op_ptr
->
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace
.
GetDeviceBuffer
());
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
}
client_example/19_pool
_fwd
/max_pool2d_fwd.cpp
→
client_example/19_pool/max_pool2d_fwd.cpp
View file @
5a7f7334
File moved
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
View file @
5a7f7334
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_
inde
x_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_
ma
x_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
...
@@ -60,7 +60,7 @@ bool maxpool_bwd_test(bool do_verification,
...
@@ -60,7 +60,7 @@ bool maxpool_bwd_test(bool do_verification,
1
>
;
// InSrcOutDstVectorSize
1
>
;
// InSrcOutDstVectorSize
using
DeviceMaxPoolBwdInstance
=
ck
::
tensor_operation
::
device
::
using
DeviceMaxPoolBwdInstance
=
ck
::
tensor_operation
::
device
::
Device
Inde
xPoolBwdImpl
<
DOutDataType
,
IndexDataType
,
DInDataType
,
4
>
;
Device
Ma
xPoolBwdImpl
<
DOutDataType
,
IndexDataType
,
DInDataType
,
4
>
;
const
ck
::
index_t
Ys
=
(
Y
-
1
)
*
window_dilation_h
+
1
;
const
ck
::
index_t
Ys
=
(
Y
-
1
)
*
window_dilation_h
+
1
;
const
ck
::
index_t
Xs
=
(
X
-
1
)
*
window_dilation_w
+
1
;
const
ck
::
index_t
Xs
=
(
X
-
1
)
*
window_dilation_w
+
1
;
...
@@ -155,7 +155,8 @@ bool maxpool_bwd_test(bool do_verification,
...
@@ -155,7 +155,8 @@ bool maxpool_bwd_test(bool do_verification,
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
(),
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
(),
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
(),
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
(),
window_spatial_lengths
,
window_spatial_lengths
,
window_strides
);
window_strides
,
window_dilations
);
if
(
!
pool_bwd
.
IsSupportedArgument
(
pool_bwd_argument_ptr
.
get
()))
if
(
!
pool_bwd
.
IsSupportedArgument
(
pool_bwd_argument_ptr
.
get
()))
{
{
...
...
include/ck/tensor_operation/gpu/device/device_
inde
x_pool_bwd.hpp
→
include/ck/tensor_operation/gpu/device/device_
ma
x_pool_bwd.hpp
View file @
5a7f7334
...
@@ -13,7 +13,7 @@ namespace device {
...
@@ -13,7 +13,7 @@ namespace device {
// For pooling which used indexable operation, such as MaxPool, MinPool...etc
// For pooling which used indexable operation, such as MaxPool, MinPool...etc
template
<
typename
DOutDataType
,
typename
IndexDataType
,
typename
DInDataType
>
template
<
typename
DOutDataType
,
typename
IndexDataType
,
typename
DInDataType
>
struct
Device
Inde
xPoolBwd
:
public
BaseOperator
struct
Device
Ma
xPoolBwd
:
public
BaseOperator
{
{
virtual
std
::
unique_ptr
<
BaseArgument
>
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_dout
,
MakeArgumentPointer
(
const
void
*
p_dout
,
...
@@ -22,7 +22,8 @@ struct DeviceIndexPoolBwd : public BaseOperator
...
@@ -22,7 +22,8 @@ struct DeviceIndexPoolBwd : public BaseOperator
index_t
dout_length
,
index_t
dout_length
,
index_t
din_length
,
index_t
din_length
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
)
=
0
;
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
};
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
View file @
5a7f7334
...
@@ -220,11 +220,11 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
...
@@ -220,11 +220,11 @@ struct DeviceGroupedGemm_Xdl_Fixed_NK : public DeviceGroupedGemmFixedNK<ALayout,
using
GridwiseGemm
=
GridwiseGemmMultipleD_xdl_splitk_cshuffle
<
using
GridwiseGemm
=
GridwiseGemmMultipleD_xdl_splitk_cshuffle
<
ADataType
,
// TODO: distinguish A/B datatype
ADataType
,
// TODO: distinguish A/B datatype
BDataType
,
BDataType
,
ComputeType
,
AccDataType
,
AccDataType
,
CShuffleDataType
,
CShuffleDataType
,
DsDataType
,
DsDataType
,
EDataType
,
EDataType
,
ComputeType
,
AElementwiseOperation
,
AElementwiseOperation
,
BElementwiseOperation
,
BElementwiseOperation
,
CDEElementwiseOperation
,
CDEElementwiseOperation
,
...
...
include/ck/tensor_operation/gpu/device/impl/device_
inde
x_pool_bwd_impl.hpp
→
include/ck/tensor_operation/gpu/device/impl/device_
ma
x_pool_bwd_impl.hpp
View file @
5a7f7334
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_
inde
x_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/device/device_
ma
x_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_put_element_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
...
@@ -25,7 +25,7 @@ template <typename DOutDataType,
...
@@ -25,7 +25,7 @@ template <typename DOutDataType,
typename
IndexDataType
,
typename
IndexDataType
,
typename
DInDataType
,
typename
DInDataType
,
ck
::
index_t
InOutVectorSize
>
ck
::
index_t
InOutVectorSize
>
struct
Device
Inde
xPoolBwdImpl
:
public
Device
Inde
xPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>
struct
Device
Ma
xPoolBwdImpl
:
public
Device
Ma
xPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>
{
{
using
DInDataType_AutomicAddPreCast
=
using
DInDataType_AutomicAddPreCast
=
conditional_t
<
is_same_v
<
DInDataType
,
float
>
||
is_same_v
<
DInDataType
,
double
>
,
conditional_t
<
is_same_v
<
DInDataType
,
float
>
||
is_same_v
<
DInDataType
,
double
>
,
...
@@ -91,7 +91,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
...
@@ -91,7 +91,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
index_t
dout_length
,
index_t
dout_length
,
index_t
din_length
,
index_t
din_length
,
const
std
::
vector
<
ck
::
index_t
>&
window_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
window_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
window_strides
)
const
std
::
vector
<
ck
::
index_t
>&
window_strides
,
const
std
::
vector
<
ck
::
index_t
>&
window_dilations
)
:
p_dout_
{
p_dout
},
:
p_dout_
{
p_dout
},
p_indices_
{
p_indices
},
p_indices_
{
p_indices
},
p_din_
{
p_din
},
p_din_
{
p_din
},
...
@@ -102,7 +103,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
...
@@ -102,7 +103,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
{
{
for
(
size_t
i
=
0
;
i
<
window_lengths
.
size
();
++
i
)
for
(
size_t
i
=
0
;
i
<
window_lengths
.
size
();
++
i
)
{
{
windowOverlap_
|=
window_lengths
.
at
(
i
)
>
window_strides
.
at
(
i
);
auto
eff
=
(
window_lengths
.
at
(
i
)
-
1
)
*
window_dilations
.
at
(
i
)
+
1
;
windowOverlap_
|=
eff
>
window_strides
.
at
(
i
);
}
}
}
}
...
@@ -228,6 +230,11 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
...
@@ -228,6 +230,11 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
}
}
else
else
{
{
hip_check_error
(
hipMemsetAsync
(
arg
.
p_din_
,
0
,
arg
.
din_length_raw_
*
sizeof
(
DInDataType
),
stream_config
.
stream_id_
));
const
auto
put_kernel
=
kernel_put_element_1d
<
GridwisePutElementSet
,
const
auto
put_kernel
=
kernel_put_element_1d
<
GridwisePutElementSet
,
InOutGrid1dDesc
,
InOutGrid1dDesc
,
DOutDataType
,
DOutDataType
,
...
@@ -292,7 +299,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
...
@@ -292,7 +299,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
index_t
dout_length
,
index_t
dout_length
,
index_t
din_length
,
index_t
din_length
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
)
override
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
)
override
{
{
// Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
// Assume p_dout, p_indices, p_din are packed memory space, dout_length and din_length are
// physical size of the packed tensor
// physical size of the packed tensor
...
@@ -302,7 +310,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
...
@@ -302,7 +310,8 @@ struct DeviceIndexPoolBwdImpl : public DeviceIndexPoolBwd<DOutDataType, IndexDat
dout_length
,
dout_length
,
din_length
,
din_length
,
window_lengths
,
window_lengths
,
window_strides
);
window_strides
,
window_dilations
);
}
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_splitk_cshuffle.hpp
View file @
5a7f7334
...
@@ -29,13 +29,13 @@ namespace ck {
...
@@ -29,13 +29,13 @@ namespace ck {
// E = cde_op(C, D0, D1, ...)
// E = cde_op(C, D0, D1, ...)
// Assume:
// Assume:
// D0, D1, ... and E have the same layout
// D0, D1, ... and E have the same layout
template
<
typename
ADataType
,
// FIXME: don't assume A/B have same datatype
template
<
typename
ADataType
,
typename
BDataType
,
typename
BDataType
,
typename
ComputeType
,
typename
AccDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
EDataType
,
typename
ComputeType
,
typename
AElementwiseOperation
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
typename
CDEElementwiseOperation
,
...
@@ -186,8 +186,8 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
...
@@ -186,8 +186,8 @@ struct GridwiseGemmMultipleD_xdl_splitk_cshuffle
constexpr
auto
c_block_size
=
constexpr
auto
c_block_size
=
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
();
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
();
return
math
::
max
(
a_block_space_size_aligned
*
sizeof
(
ADataType
)
+
return
math
::
max
(
(
a_block_space_size_aligned
+
b_block_space_size_aligned
)
*
b_block_space_size_aligned
*
sizeof
(
BData
Type
),
sizeof
(
Compute
Type
),
c_block_size
*
sizeof
(
CShuffleDataType
));
c_block_size
*
sizeof
(
CShuffleDataType
));
}
}
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
View file @
5a7f7334
...
@@ -37,7 +37,8 @@ __global__ void
...
@@ -37,7 +37,8 @@ __global__ void
index_t
StrideC
,
index_t
StrideC
,
typename
GridwiseGemm
::
Block2CTileMap
block_mapping
)
typename
GridwiseGemm
::
Block2CTileMap
block_mapping
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
constexpr
index_t
shared_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
();
constexpr
index_t
shared_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
();
__shared__
uint8_t
p_shared
[
shared_size
];
__shared__
uint8_t
p_shared
[
shared_size
];
...
...
include/ck/utility/reduction_operator.hpp
View file @
5a7f7334
...
@@ -115,8 +115,16 @@ struct Max
...
@@ -115,8 +115,16 @@ struct Max
{
{
template
<
typename
T
>
template
<
typename
T
>
__host__
__device__
static
constexpr
T
GetIdentityValue
()
__host__
__device__
static
constexpr
T
GetIdentityValue
()
{
if
constexpr
(
is_same_v
<
T
,
bhalf_t
>
)
{
float
val
=
NumericLimits
<
float
>::
Lowest
();
return
type_convert
<
bhalf_t
>
(
val
);
}
else
{
{
return
NumericLimits
<
T
>::
Lowest
();
return
NumericLimits
<
T
>::
Lowest
();
}
};
};
__host__
__device__
static
constexpr
bool
__host__
__device__
static
constexpr
bool
...
@@ -138,6 +146,15 @@ struct Max
...
@@ -138,6 +146,15 @@ struct Max
a
=
b
;
a
=
b
;
}
}
__host__
__device__
inline
constexpr
void
operator
()(
bhalf_t
&
a
,
bhalf_t
b
)
const
{
float
a_
=
type_convert
<
float
>
(
a
);
float
b_
=
type_convert
<
float
>
(
b
);
if
(
a_
<
b_
)
a
=
b
;
}
template
<
typename
T
>
template
<
typename
T
>
__host__
__device__
inline
constexpr
void
operator
()(
T
&
a
,
T
b
,
bool
&
changed
)
const
__host__
__device__
inline
constexpr
void
operator
()(
T
&
a
,
T
b
,
bool
&
changed
)
const
{
{
...
@@ -152,14 +169,35 @@ struct Max
...
@@ -152,14 +169,35 @@ struct Max
changed
=
true
;
changed
=
true
;
}
}
}
}
__host__
__device__
inline
constexpr
void
operator
()(
bhalf_t
&
a
,
bhalf_t
b
,
bool
&
changed
)
const
{
float
a_
=
type_convert
<
float
>
(
a
);
float
b_
=
type_convert
<
float
>
(
b
);
if
(
a_
<
b_
)
{
a
=
b
;
changed
=
true
;
}
}
};
};
struct
Min
struct
Min
{
{
template
<
typename
T
>
template
<
typename
T
>
__host__
__device__
static
constexpr
T
GetIdentityValue
()
__host__
__device__
static
constexpr
T
GetIdentityValue
()
{
if
constexpr
(
is_same_v
<
T
,
bhalf_t
>
)
{
float
val
=
NumericLimits
<
float
>::
Max
();
return
type_convert
<
bhalf_t
>
(
val
);
}
else
{
{
return
NumericLimits
<
T
>::
Max
();
return
NumericLimits
<
T
>::
Max
();
}
return
NumericLimits
<
T
>::
Max
();
};
};
__host__
__device__
static
constexpr
bool
__host__
__device__
static
constexpr
bool
...
@@ -181,6 +219,15 @@ struct Min
...
@@ -181,6 +219,15 @@ struct Min
a
=
b
;
a
=
b
;
}
}
__host__
__device__
inline
constexpr
void
operator
()(
bhalf_t
&
a
,
bhalf_t
b
)
const
{
float
a_
=
type_convert
<
float
>
(
a
);
float
b_
=
type_convert
<
float
>
(
b
);
if
(
a_
>
b_
)
a
=
b
;
}
template
<
typename
T
>
template
<
typename
T
>
__host__
__device__
inline
constexpr
void
operator
()(
T
&
a
,
T
b
,
bool
&
changed
)
const
__host__
__device__
inline
constexpr
void
operator
()(
T
&
a
,
T
b
,
bool
&
changed
)
const
{
{
...
@@ -195,6 +242,18 @@ struct Min
...
@@ -195,6 +242,18 @@ struct Min
changed
=
true
;
changed
=
true
;
}
}
}
}
__host__
__device__
inline
constexpr
void
operator
()(
bhalf_t
&
a
,
bhalf_t
b
,
bool
&
changed
)
const
{
float
a_
=
type_convert
<
float
>
(
a
);
float
b_
=
type_convert
<
float
>
(
b
);
if
(
a_
>
b_
)
{
a
=
b
;
changed
=
true
;
}
}
};
};
struct
AMax
struct
AMax
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp
View file @
5a7f7334
...
@@ -53,8 +53,17 @@ struct ReferenceMaxPoolBwd : public device::BaseOperator
...
@@ -53,8 +53,17 @@ struct ReferenceMaxPoolBwd : public device::BaseOperator
{
{
int
index
=
arg
.
indices_
.
mData
[
i
];
int
index
=
arg
.
indices_
.
mData
[
i
];
if
(
index
>=
0
&&
index
<
din_length
)
if
(
index
>=
0
&&
index
<
din_length
)
{
if
constexpr
(
is_same_v
<
ConputeDataType
,
bhalf_t
>
)
{
float
buf_val
=
ck
::
type_convert
<
float
>
(
buf
[
index
]);
buf_val
+=
ck
::
type_convert
<
float
>
(
arg
.
dout_
.
mData
[
i
]);
buf
[
index
]
=
ck
::
type_convert
<
ConputeDataType
>
(
buf_val
);
}
else
buf
[
index
]
+=
ck
::
type_convert
<
ConputeDataType
>
(
arg
.
dout_
.
mData
[
i
]);
buf
[
index
]
+=
ck
::
type_convert
<
ConputeDataType
>
(
arg
.
dout_
.
mData
[
i
]);
}
}
}
for
(
int
i
=
0
;
i
<
din_length
;
++
i
)
for
(
int
i
=
0
;
i
<
din_length
;
++
i
)
arg
.
din_
.
mData
[
i
]
=
ck
::
type_convert
<
DInDataType
>
(
buf
[
i
]);
arg
.
din_
.
mData
[
i
]
=
ck
::
type_convert
<
DInDataType
>
(
buf
[
i
]);
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp
View file @
5a7f7334
...
@@ -256,10 +256,12 @@ struct ReferencePoolingFwd : public device::BaseOperator
...
@@ -256,10 +256,12 @@ struct ReferencePoolingFwd : public device::BaseOperator
for
(
ck
::
index_t
y
=
0
;
y
<
arg
.
window_spatial_lengths_
[
0
];
++
y
)
for
(
ck
::
index_t
y
=
0
;
y
<
arg
.
window_spatial_lengths_
[
0
];
++
y
)
{
{
ck
::
index_t
hi
=
ho
*
arg
.
window_strides_
[
0
]
+
y
-
arg
.
in_left_pads_
[
0
];
ck
::
index_t
hi
=
ho
*
arg
.
window_strides_
[
0
]
+
y
*
arg
.
window_dilations_
[
0
]
-
arg
.
in_left_pads_
[
0
];
for
(
ck
::
index_t
x
=
0
;
x
<
arg
.
window_spatial_lengths_
[
1
];
++
x
)
for
(
ck
::
index_t
x
=
0
;
x
<
arg
.
window_spatial_lengths_
[
1
];
++
x
)
{
{
ck
::
index_t
wi
=
wo
*
arg
.
window_strides_
[
1
]
+
x
-
arg
.
in_left_pads_
[
1
];
ck
::
index_t
wi
=
wo
*
arg
.
window_strides_
[
1
]
+
x
*
arg
.
window_dilations_
[
1
]
-
arg
.
in_left_pads_
[
1
];
if
(
hi
>=
0
&&
if
(
hi
>=
0
&&
hi
<
static_cast
<
ck
::
index_t
>
(
arg
.
in_
.
mDesc
.
GetLengths
()[
2
])
&&
hi
<
static_cast
<
ck
::
index_t
>
(
arg
.
in_
.
mDesc
.
GetLengths
()[
2
])
&&
wi
>=
0
&&
wi
>=
0
&&
...
...
library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp
0 → 100644
View file @
5a7f7334
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_FP16
void
add_device_avgpool_bwd_ndhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceAvgPoolBwd
<
3
,
F16
,
F16
,
NDHWC
,
NDHWC
>>>&
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_avgpool_bwd_ndhwc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceAvgPoolBwd
<
3
,
BF16
,
BF16
,
NDHWC
,
NDHWC
>>>&
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_avgpool_bwd_ndhwc_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceAvgPoolBwd
<
3
,
F32
,
F32
,
NDHWC
,
NDHWC
>>>&
);
#endif
template
<
typename
DOutDataType
,
typename
DInDataType
,
typename
InLayout
,
typename
OutLayout
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceAvgPoolBwd
<
3
,
DOutDataType
,
DInDataType
,
InLayout
,
OutLayout
>>
{
using
DeviceOp
=
DeviceAvgPoolBwd
<
3
,
DOutDataType
,
DInDataType
,
InLayout
,
OutLayout
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
is_same_v
<
InLayout
,
NDHWC
>
&&
is_same_v
<
OutLayout
,
NDHWC
>
)
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
DOutDataType
,
F16
>
&&
is_same_v
<
DInDataType
,
F16
>
)
add_device_avgpool_bwd_ndhwc_f16_instances
(
op_ptrs
);
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
DOutDataType
,
BF16
>
&&
is_same_v
<
DInDataType
,
BF16
>
)
add_device_avgpool_bwd_ndhwc_bf16_instances
(
op_ptrs
);
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
DOutDataType
,
F32
>
&&
is_same_v
<
DInDataType
,
F32
>
)
add_device_avgpool_bwd_ndhwc_f32_instances
(
op_ptrs
);
#endif
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
0 → 100644
View file @
5a7f7334
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_FP16
void
add_device_maxpool_bwd_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
F16
,
I32
,
F16
>>>&
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_maxpool_bwd_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
BF16
,
I32
,
BF16
>>>&
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_maxpool_bwd_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
F32
,
I32
,
F32
>>>&
);
#endif
template
<
typename
DOutDataType
,
typename
IndexDataType
,
typename
DInDataType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>>
{
using
DeviceOp
=
DeviceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
DInDataType
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
DOutDataType
,
F16
>
&&
is_same_v
<
DInDataType
,
F16
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
add_device_maxpool_bwd_f16_instances
(
op_ptrs
);
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
DOutDataType
,
BF16
>
&&
is_same_v
<
DInDataType
,
BF16
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
add_device_maxpool_bwd_bf16_instances
(
op_ptrs
);
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
DOutDataType
,
F32
>
&&
is_same_v
<
DInDataType
,
F32
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
add_device_maxpool_bwd_f32_instances
(
op_ptrs
);
#endif
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp
View file @
5a7f7334
...
@@ -37,6 +37,21 @@ void add_device_pool3d_fwd_ndhwc_index_f16_instances(
...
@@ -37,6 +37,21 @@ void add_device_pool3d_fwd_ndhwc_index_f16_instances(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
F16
,
F16
,
I32
,
NDHWC
,
NDHWC
,
MaxOp
,
true
>>>&
);
DevicePoolFwd
<
InOutRank
,
WindowRank
,
F16
,
F16
,
I32
,
NDHWC
,
NDHWC
,
MaxOp
,
true
>>>&
);
#endif
#endif
#ifdef CK_ENABLE_BF16
// BF16
void
add_device_pool3d_fwd_ndhwc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
MaxOp
,
false
>>>&
);
void
add_device_pool3d_fwd_ndhwc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
AvgOp
,
false
>>>&
);
// BF16 - return index
void
add_device_pool3d_fwd_ndhwc_index_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
BF16
,
BF16
,
I32
,
NDHWC
,
NDHWC
,
MaxOp
,
true
>>>&
);
#endif
#ifdef CK_ENABLE_FP32
#ifdef CK_ENABLE_FP32
// FP32
// FP32
void
add_device_pool3d_fwd_ndhwc_f32_instances
(
void
add_device_pool3d_fwd_ndhwc_f32_instances
(
...
@@ -98,8 +113,22 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
...
@@ -98,8 +113,22 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
}
}
}
}
#endif
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
{
if
constexpr
(
OutputIndex
&&
ReduceOpId
==
MaxOp
)
{
add_device_pool3d_fwd_ndhwc_index_bf16_instances
(
op_ptrs
);
}
else
{
add_device_pool3d_fwd_ndhwc_bf16_instances
(
op_ptrs
);
}
}
#endif
#ifdef CK_ENABLE_FP32
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
&&
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
is_same_v
<
IndexDataType
,
I32
>
)
{
{
if
constexpr
(
OutputIndex
&&
ReduceOpId
==
MaxOp
)
if
constexpr
(
OutputIndex
&&
ReduceOpId
==
MaxOp
)
...
...
library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt
0 → 100644
View file @
5a7f7334
set
(
DEVICE_AVGPOOL_BWD_INSTANCES
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
list
(
APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f16_instance.cpp
)
endif
()
if
(
DTYPES MATCHES
"bf16"
OR NOT DEFINED DTYPES
)
list
(
APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp32"
OR NOT DEFINED DTYPES
)
list
(
APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f32_instance.cpp
)
endif
()
add_instance_library
(
device_avg_pool3d_bwd_instance
${
DEVICE_AVGPOOL_BWD_INSTANCES
}
)
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment