Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7e44fd84
Commit
7e44fd84
authored
Oct 14, 2022
by
Jing Zhang
Browse files
fixed
parent
15965dfc
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
108 additions
and
91 deletions
+108
-91
example/43_elementwise_permute_2d/elementwise_permute_4D_fp16_2d.cpp
...elementwise_permute_2d/elementwise_permute_4D_fp16_2d.cpp
+22
-28
include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
.../ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
+45
-32
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
.../ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+35
-31
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
...operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+6
-0
No files found.
example/43_elementwise_permute_2d/elementwise_permute_4D_fp16_2d.cpp
View file @
7e44fd84
...
@@ -11,7 +11,6 @@
...
@@ -11,7 +11,6 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
ADataType
=
F16
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
BDataType
=
F16
;
...
@@ -21,13 +20,12 @@ using DeviceElementwisePermuteInstance =
...
@@ -21,13 +20,12 @@ using DeviceElementwisePermuteInstance =
ck
::
tensor_operation
::
device
::
DeviceElementwise
<
ck
::
Tuple
<
ADataType
>
,
ck
::
tensor_operation
::
device
::
DeviceElementwise
<
ck
::
Tuple
<
ADataType
>
,
ck
::
Tuple
<
BDataType
>
,
ck
::
Tuple
<
BDataType
>
,
PassThrough
,
PassThrough
,
4
,
3
,
// NumDim_M
3
,
1
,
// NumDim_N
1
,
8
,
8
,
8
,
8
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
;
ck
::
Sequence
<
8
>>
;
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
Functor
>
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
Functor
>
void
host_elementwise4D
(
HostTensorB
&
B_nhwc
,
void
host_elementwise4D
(
HostTensorB
&
B_nhwc
,
...
@@ -48,10 +46,16 @@ void host_elementwise4D(HostTensorB& B_nhwc,
...
@@ -48,10 +46,16 @@ void host_elementwise4D(HostTensorB& B_nhwc,
int
main
()
int
main
()
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
bool
time_kernel
=
false
;
std
::
vector
<
std
::
size_t
>
nchw
=
{
4
,
8
,
4
,
8
};
const
int
N
=
16
;
std
::
vector
<
std
::
size_t
>
nhwc
=
{
4
,
4
,
8
,
8
};
const
int
H
=
32
;
const
int
W
=
64
;
const
int
C
=
128
;
std
::
vector
<
std
::
size_t
>
nchw
=
{
N
,
C
,
H
,
W
};
std
::
vector
<
std
::
size_t
>
nhwc
=
{
N
,
H
,
W
,
C
};
Tensor
<
ADataType
>
a
(
nchw
);
Tensor
<
ADataType
>
a
(
nchw
);
Tensor
<
BDataType
>
b
(
nhwc
);
Tensor
<
BDataType
>
b
(
nhwc
);
...
@@ -62,28 +66,16 @@ int main()
...
@@ -62,28 +66,16 @@ int main()
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a
.
mData
.
data
());
a_device_buf
.
ToDevice
(
a
.
mData
.
data
());
LogRangeAsType
<
float
>
(
std
::
cout
<<
"Tensor a : "
,
a
.
mData
,
","
)
<<
std
::
endl
;
// LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
ck
::
index_t
,
4
>
ab_lengths
;
std
::
array
<
ck
::
index_t
,
4
>
ab_lengths
{
N
,
H
,
W
,
C
};
std
::
array
<
ck
::
index_t
,
4
>
a_strides
=
{
static_cast
<
int
>
(
nhwc
[
1
]
*
nhwc
[
2
]
*
nhwc
[
3
]),
// std::copy(nhwc.begin(), nhwc.end(), ab_lengths.begin());
static_cast
<
int
>
(
nhwc
[
2
]),
1
,
static_cast
<
int
>
(
nhwc
[
1
]
*
nhwc
[
2
])};
std
::
array
<
ck
::
index_t
,
4
>
b_strides
=
{
static_cast
<
int
>
(
nhwc
[
1
]
*
nhwc
[
2
]
*
nhwc
[
3
]),
static_cast
<
int
>
(
nhwc
[
2
]
*
nhwc
[
3
]),
static_cast
<
int
>
(
nhwc
[
3
]),
1
};
// std::cout << "Length: " << ab_lengths << std::endl;
// std::cout << "A stride: " << a_strides << std::endl;
// std::cout << "B stride: " << b_strides << std::endl;
std
::
copy
(
nhwc
.
begin
(),
nhwc
.
end
(),
ab_lengths
.
begin
());
std
::
array
<
ck
::
index_t
,
4
>
a_strides
=
{
C
*
H
*
W
,
W
,
1
,
H
*
W
};
// std::copy(a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end(), a_strides.begin());
std
::
array
<
ck
::
index_t
,
4
>
b_strides
=
{
H
*
W
*
C
,
W
*
C
,
C
,
1
};
// std::copy(b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end(), b_strides.begin());
auto
broadcastPermute
=
DeviceElementwisePermuteInstance
{};
auto
broadcastPermute
=
DeviceElementwisePermuteInstance
{};
auto
argument
=
broadcastPermute
.
MakeArgumentPointer
(
auto
argument
=
broadcastPermute
.
MakeArgumentPointer
(
...
@@ -94,6 +86,7 @@ int main()
...
@@ -94,6 +86,7 @@ int main()
throw
std
::
runtime_error
(
throw
std
::
runtime_error
(
"The runtime parameters seems not supported by the device instance, exiting!"
);
"The runtime parameters seems not supported by the device instance, exiting!"
);
};
};
auto
broadcastPermute_invoker_ptr
=
broadcastPermute
.
MakeInvokerPointer
();
auto
broadcastPermute_invoker_ptr
=
broadcastPermute
.
MakeInvokerPointer
();
float
ave_time
=
float
ave_time
=
broadcastPermute_invoker_ptr
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
broadcastPermute_invoker_ptr
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
...
@@ -105,12 +98,13 @@ int main()
...
@@ -105,12 +98,13 @@ int main()
if
(
do_verification
)
if
(
do_verification
)
{
{
b_device_buf
.
FromDevice
(
b
.
mData
.
data
());
b_device_buf
.
FromDevice
(
b
.
mData
.
data
());
LogRangeAsType
<
float
>
(
std
::
cout
<<
"Tensor b : "
,
b
.
mData
,
","
)
<<
std
::
endl
;
// LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
Tensor
<
BDataType
>
host_b
(
nhwc
);
Tensor
<
BDataType
>
host_b
(
nhwc
);
host_elementwise4D
<
Tensor
<
ADataType
>
,
Tensor
<
BDataType
>
,
PassThrough
>
(
host_elementwise4D
<
Tensor
<
ADataType
>
,
Tensor
<
BDataType
>
,
PassThrough
>
(
host_b
,
a
,
nchw
,
PassThrough
{});
host_b
,
a
,
nchw
,
PassThrough
{});
LogRangeAsType
<
float
>
(
std
::
cout
<<
"Host b : "
,
host_b
.
mData
,
","
)
<<
std
::
endl
;
// LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
pass
&=
pass
&=
ck
::
utils
::
check_err
(
b
.
mData
,
host_b
.
mData
,
"Error: Incorrect results b"
,
1e-3
,
1e-3
);
ck
::
utils
::
check_err
(
b
.
mData
,
host_b
.
mData
,
"Error: Incorrect results b"
,
1e-3
,
1e-3
);
}
}
...
...
include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp
View file @
7e44fd84
...
@@ -20,20 +20,24 @@ namespace device {
...
@@ -20,20 +20,24 @@ namespace device {
template
<
typename
InDataTypeTuple
,
template
<
typename
InDataTypeTuple
,
typename
OutDataTypeTuple
,
typename
OutDataTypeTuple
,
typename
ElementwiseOperation
,
typename
ElementwiseOperation
,
index_t
NumDim
,
index_t
NumDim_m
,
index_t
NumDim_m
,
index_t
NumDim_n
,
index_t
NumDim_n
,
index_t
MPerThread
,
index_t
MPerThread
,
index_t
NPerThread
,
index_t
NPerThread
,
typename
InScalarPerVectorSeq
,
typename
InScalarPerVectorSeq
,
typename
OutScalarPerVectorSeq
>
typename
OutScalarPerVectorSeq
>
struct
DeviceElementwise
struct
DeviceElementwise
:
public
DeviceElementwiseBase
<
InDataTypeTuple
,
:
public
DeviceElementwiseBase
<
InDataTypeTuple
,
OutDataTypeTuple
,
ElementwiseOperation
,
NumDim
>
OutDataTypeTuple
,
ElementwiseOperation
,
NumDim_m
+
NumDim_n
>
{
{
static
constexpr
index_t
NumDim
=
NumDim_m
+
NumDim_n
;
static
constexpr
int
NumInput
=
InDataTypeTuple
::
Size
();
static
constexpr
int
NumInput
=
InDataTypeTuple
::
Size
();
static
constexpr
int
NumOutput
=
OutDataTypeTuple
::
Size
();
static
constexpr
int
NumOutput
=
OutDataTypeTuple
::
Size
();
// const index_t NumDim = NumDim_m + NumDim_n;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static_assert
(
NumInput
==
InScalarPerVectorSeq
::
Size
()
&&
static_assert
(
NumInput
==
InScalarPerVectorSeq
::
Size
()
&&
NumOutput
==
OutScalarPerVectorSeq
::
Size
(),
NumOutput
==
OutScalarPerVectorSeq
::
Size
(),
...
@@ -67,15 +71,16 @@ struct DeviceElementwise
...
@@ -67,15 +71,16 @@ struct DeviceElementwise
template
<
typename
Desc_MN
>
template
<
typename
Desc_MN
>
static
auto
PadDescriptor_MN_2d
(
Desc_MN
desc_mn
,
index_t
gridSize
,
index_t
blockSize
)
static
auto
PadDescriptor_MN_2d
(
Desc_MN
desc_mn
,
index_t
gridSize
,
index_t
blockSize
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
const
auto
m
=
desc_mn
.
GetLength
(
I0
);
const
auto
m
=
desc_mn
.
GetLength
(
I0
);
const
auto
n
=
desc_mn
.
GetLength
(
I1
);
const
auto
n
=
desc_mn
.
GetLength
(
I1
);
const
index_t
loop_step_m
=
gridSize
*
blockSize
*
MPerThread
;
const
index_t
loop_step_m
=
MPerThread
;
const
index_t
loop_step_n
=
gridSize
*
blockSize
*
NPerThread
;
const
index_t
loop_step_n
=
gridSize
*
blockSize
*
NPerThread
;
const
auto
pad_m
=
math
::
integer_least_multiple
(
m
,
loop_step_m
)
-
m
;
const
auto
pad_m
=
math
::
integer_least_multiple
(
m
,
loop_step_m
)
-
m
;
const
auto
pad_n
=
math
::
integer_least_multiple
(
n
,
loop_step_n
)
-
n
;
const
auto
pad_n
=
math
::
integer_least_multiple
(
n
,
loop_step_n
)
-
n
;
std
::
cout
<<
NumDim_m
<<
" m: "
<<
m
<<
" loop_step_m: "
<<
loop_step_m
<<
" pad_m: "
<<
pad_m
<<
std
::
endl
;
std
::
cout
<<
NumDim_n
<<
" n: "
<<
n
<<
" loop_step_n: "
<<
loop_step_n
<<
" pad_n: "
<<
pad_n
<<
std
::
endl
;
const
auto
desc_mn_pad
=
transform_tensor_descriptor
(
const
auto
desc_mn_pad
=
transform_tensor_descriptor
(
desc_mn
,
desc_mn
,
make_tuple
(
make_right_pad_transform
(
m
,
pad_m
),
make_right_pad_transform
(
n
,
pad_n
)),
make_tuple
(
make_right_pad_transform
(
m
,
pad_m
),
make_right_pad_transform
(
n
,
pad_n
)),
...
@@ -137,7 +142,6 @@ struct DeviceElementwise
...
@@ -137,7 +142,6 @@ struct DeviceElementwise
using
OutGrid2dDescTuple
=
decltype
(
GenerateInOutGrid2dDescTuple
(
Number
<
NumOutput
>
{}));
using
OutGrid2dDescTuple
=
decltype
(
GenerateInOutGrid2dDescTuple
(
Number
<
NumOutput
>
{}));
using
InGrid2dDescTuple
=
decltype
(
GenerateInOutGrid2dDescTuple
(
Number
<
NumInput
>
{}));
using
InGrid2dDescTuple
=
decltype
(
GenerateInOutGrid2dDescTuple
(
Number
<
NumInput
>
{}));
// using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumOutput>{}));
using
GridwiseElementwise
=
GridwiseElementwise_2D
<
InGrid2dDescTuple
,
using
GridwiseElementwise
=
GridwiseElementwise_2D
<
InGrid2dDescTuple
,
OutGrid2dDescTuple
,
OutGrid2dDescTuple
,
...
@@ -165,6 +169,9 @@ struct DeviceElementwise
...
@@ -165,6 +169,9 @@ struct DeviceElementwise
blockSize_
(
256
),
blockSize_
(
256
),
gridSize_
(
120
)
// FIXME - Calculate the grid size by number of CU in the future
gridSize_
(
120
)
// FIXME - Calculate the grid size by number of CU in the future
{
{
static_assert
(
NumDim_m
>
0
,
""
);
static_assert
(
NumDim_n
>
0
,
""
);
in_dev_buffers_
=
generate_tuple
(
in_dev_buffers_
=
generate_tuple
(
[
&
](
auto
I
)
{
[
&
](
auto
I
)
{
using
DataType
=
remove_cvref_t
<
decltype
(
InDataTypeTuple
{}[
I
])
>
;
using
DataType
=
remove_cvref_t
<
decltype
(
InDataTypeTuple
{}[
I
])
>
;
...
@@ -257,29 +264,31 @@ struct DeviceElementwise
...
@@ -257,29 +264,31 @@ struct DeviceElementwise
const
std
::
array
<
index_t
,
NumDim
>&
strides
,
const
std
::
array
<
index_t
,
NumDim
>&
strides
,
index_t
scalarPerVector
,
index_t
scalarPerVector
,
index_t
vectorDim
)
{
index_t
vectorDim
)
{
std
::
cout
<<
"scalarPerVector: "
<<
scalarPerVector
<<
std
::
endl
;
// std::cout << "scalarPerVector: " << scalarPerVector << std::endl;
std
::
cout
<<
"stride back: "
<<
strides
.
back
()
<<
std
::
endl
;
// std::cout << "stride back: " << strides.back() << std::endl;
std
::
cout
<<
"len back: "
<<
lengths
.
back
()
<<
std
::
endl
;
// std::cout << "len back: " << lengths.back() << std::endl;
std
::
cout
<<
"NumDim-1: "
<<
NumDim
-
1
<<
std
::
endl
;
// std::cout << "NumDim-1: " << NumDim - 1 << std::endl;
std
::
cout
<<
"stride[nd-1]: "
<<
strides
[
NumDim
-
1
]
<<
std
::
endl
;
// std::cout << "stride[nd-1]: " << strides[NumDim - 1] << std::endl;
std
::
cout
<<
"NumDim_m-1: "
<<
NumDim_m
-
1
<<
std
::
endl
;
// std::cout << "NumDim_m-1: " << NumDim_m - 1 << std::endl;
std
::
cout
<<
std
::
endl
;
// std::cout << std::endl;
std
::
cout
<<
"ISPVV Check 1 starting"
<<
std
::
endl
;
// std::cout << "ISPVV Check 1 starting" << std::endl;
if
(
strides
[
vectorDim
]
==
1
&&
(
lengths
[
vectorDim
]
%
scalarPerVector
==
0
||
lengths
[
vectorDim
]
%
scalarPerVector
==
lengths
[
vectorDim
]))
if
(
strides
[
vectorDim
]
==
1
&&
(
lengths
[
vectorDim
]
%
scalarPerVector
==
0
||
lengths
[
vectorDim
]
%
scalarPerVector
==
lengths
[
vectorDim
]))
{
{
std
::
cout
<<
"Check 1 passed"
<<
std
::
endl
;
// std::cout << "Check 1 passed" << std::endl;
return
true
;
return
true
;
}
}
std
::
cout
<<
"Check 1 failed "
<<
std
::
endl
;
// std::cout << "Check 1 failed " << std::endl;
std
::
cout
<<
"ISPVV Check 2 starting"
<<
std
::
endl
;
// std::cout << "ISPVV Check 2 starting" << std::endl;
std
::
cout
<<
"strides[vectorDim]: "
<<
strides
[
vectorDim
]
<<
std
::
endl
;
// std::cout << "strides[vectorDim]: " << strides[vectorDim] << std::endl;
if
(
strides
[
vectorDim
]
!=
1
&&
scalarPerVector
==
strides
[
vectorDim
])
if
(
strides
[
vectorDim
]
!=
1
&&
scalarPerVector
==
strides
[
vectorDim
])
{
{
std
::
cout
<<
"Check 2 passed "
<<
std
::
endl
;
// std::cout << "Check 2 passed " << std::endl;
return
true
;
return
true
;
}
}
std
::
cout
<<
"Check 2 failed"
<<
std
::
endl
;
// std::cout << "Check 2 failed" << std::endl;
return
false
;
return
false
;
};
};
...
@@ -300,16 +309,20 @@ struct DeviceElementwise
...
@@ -300,16 +309,20 @@ struct DeviceElementwise
bool
valid
=
true
;
bool
valid
=
true
;
static_for
<
0
,
NumInput
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
NumInput
,
1
>
{}([
&
](
auto
I
)
{
std
::
cout
<<
"running: "
<<
I
<<
std
::
endl
;
std
::
cout
<<
"running: "
<<
I
<<
std
::
endl
;
if
(
!
IsScalarPerVectorValid
(
if
(
!
IsScalarPerVectorValid
(
pArg
->
lengths_
,
pArg
->
lengths_
,
pArg
->
inStridesArray_
[
I
.
value
],
InScalarPerVectorSeq
::
At
(
I
),
NumDim_m
-
1
))
pArg
->
inStridesArray_
[
I
.
value
],
InScalarPerVectorSeq
::
At
(
I
),
NumDim_m
-
1
))
valid
=
false
;
valid
=
false
;
});
});
std
::
cout
<<
"valid after loop through input: "
<<
valid
<<
std
::
endl
;
std
::
cout
<<
"valid after loop through input: "
<<
valid
<<
std
::
endl
;
static_for
<
0
,
NumOutput
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
NumOutput
,
1
>
{}([
&
](
auto
I
)
{
std
::
cout
<<
"running 2: "
<<
I
<<
std
::
endl
;
std
::
cout
<<
"running 2: "
<<
I
<<
std
::
endl
;
if
(
!
IsScalarPerVectorValid
(
if
(
!
IsScalarPerVectorValid
(
pArg
->
lengths_
,
pArg
->
lengths_
,
pArg
->
outStridesArray_
[
I
.
value
],
OutScalarPerVectorSeq
::
At
(
I
),
NumDim
-
1
))
pArg
->
outStridesArray_
[
I
.
value
],
OutScalarPerVectorSeq
::
At
(
I
),
NumDim
-
1
))
valid
=
false
;
valid
=
false
;
});
});
std
::
cout
<<
"valid after loop through output: "
<<
valid
<<
std
::
endl
;
std
::
cout
<<
"valid after loop through output: "
<<
valid
<<
std
::
endl
;
...
...
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
View file @
7e44fd84
...
@@ -102,26 +102,20 @@ struct GridwiseElementwise_2D
...
@@ -102,26 +102,20 @@ struct GridwiseElementwise_2D
Number
<
NumOutput
>
{});
Number
<
NumOutput
>
{});
const
index_t
blockSize
=
get_block_size
();
const
index_t
blockSize
=
get_block_size
();
const
index_t
blockPerGrid_m
=
get_grid_size
();
const
index_t
blockPerGrid
=
get_grid_size
();
//const index_t blockPerGrid_n = gridDim.y;
const
index_t
totalNumThread
=
blockSize
*
blockPerGrid
;
// const index_t block_1d = get_block_1d_id();
const
auto
M
=
in_grid_2d_desc_tuple
[
I0
].
GetLength
(
I0
);
const
auto
M
=
in_grid_2d_desc_tuple
[
I0
].
GetLength
(
I0
);
const
auto
N
=
in_grid_2d_desc_tuple
[
I0
].
GetLength
(
I1
);
const
auto
N
=
in_grid_2d_desc_tuple
[
I0
].
GetLength
(
I1
);
const
index_t
loop_step_m
=
blockPerGrid_m
*
blockSize
*
MPerThread
;
const
index_t
loop_step_m
=
MPerThread
;
const
index_t
loop_step_n
=
blockPerGrid_m
*
blockSize
*
NPerThread
;
const
index_t
loop_step_n
=
totalNumThread
*
NPerThread
;
const
auto
loop_step_index_m
=
make_multi_index
(
loop_step_m
,
0
);
const
auto
loop_step_index_n
=
make_multi_index
(
0
,
loop_step_n
);
const
index_t
thread_1d_id
=
get_thread_global_1d_id
();
const
index_t
thread_1d_id
=
get_thread_global_1d_id
();
index_t
tid_m
=
thread_1d_id
/
(
N
/
NPerThread
);
// index_t tid_m = thread_1d_id / (N / NPerThread);
index_t
tid_n
=
thread_1d_id
%
(
N
/
NPerThread
);
// index_t tid_n = thread_1d_id % (N / NPerThread);
//index_t tid_m = thread_1d_id;
//index_t tid_n = blockDim.y * blockIdx.y + threadIdx.y;
const
auto
thread_global_offset
=
make_multi_index
(
0
,
thread_1d_id
*
NPerThread
);
const
auto
thread_global_offset
=
make_multi_index
(
tid_m
*
MPerThread
,
tid_n
*
NPerThread
);
// make_multi_index(thread_global_id_2d[I0] * MPerThread, thread_global_id_2d[I1] *
// NPerThread);
auto
in_global_load_tuple
=
generate_tuple
(
auto
in_global_load_tuple
=
generate_tuple
(
[
&
](
auto
I
)
{
[
&
](
auto
I
)
{
...
@@ -135,10 +129,10 @@ struct GridwiseElementwise_2D
...
@@ -135,10 +129,10 @@ struct GridwiseElementwise_2D
decltype
(
thread_buffer_desc_mn
),
decltype
(
thread_buffer_desc_mn
),
Sequence
<
MPerThread
,
NPerThread
>
,
// SliceLengths
Sequence
<
MPerThread
,
NPerThread
>
,
// SliceLengths
Sequence
<
0
,
1
>
,
// DimAccessOrder
Sequence
<
0
,
1
>
,
// DimAccessOrder
1
,
// SrcVectorDim
0
,
// SrcVectorDim
InScalarPerVectorSeq
::
At
(
I
),
// ScalarPerVector
InScalarPerVectorSeq
::
At
(
I
),
// ScalarPerVector
1
,
// SrcScalarStrideInVector
1
,
// SrcScalarStrideInVector
false
>
{
in_grid_2d_desc_tuple
[
I
],
thread_global_offset
};
true
>
{
in_grid_2d_desc_tuple
[
I
],
thread_global_offset
};
},
},
Number
<
NumInput
>
{});
Number
<
NumInput
>
{});
...
@@ -154,21 +148,21 @@ struct GridwiseElementwise_2D
...
@@ -154,21 +148,21 @@ struct GridwiseElementwise_2D
decltype
(
out_grid_2d_desc_tuple
[
I
]),
decltype
(
out_grid_2d_desc_tuple
[
I
]),
PassThroughOp
,
PassThroughOp
,
Sequence
<
MPerThread
,
NPerThread
>
,
// SliceLengths
Sequence
<
MPerThread
,
NPerThread
>
,
// SliceLengths
Sequence
<
1
,
0
>
,
// DimAccessOrder
Sequence
<
0
,
1
>
,
// DimAccessOrder
0
,
// SrcVectorDim
1
,
// SrcVectorDim
OutScalarPerVectorSeq
::
At
(
I
),
1
,
// OutScalarPerVectorSeq::At(I),
InMemoryDataOperationEnum
::
Set
,
InMemoryDataOperationEnum
::
Set
,
1
,
1
,
false
>
(
out_grid_2d_desc_tuple
[
I
],
thread_global_offset
,
PassThroughOp
{});
true
>
(
out_grid_2d_desc_tuple
[
I
],
thread_global_offset
,
PassThroughOp
{});
},
},
Number
<
NumOutput
>
{});
Number
<
NumOutput
>
{});
index_t
num_iter_m
=
M
/
(
loop_step_m
);
index_t
num_iter_m
=
M
/
(
loop_step_m
);
index_t
num_iter_n
=
N
/
(
loop_step_n
);
do
do
{
{
index_t
num_iter_n
=
N
/
(
loop_step_n
);
do
do
{
{
static_for
<
0
,
NumInput
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
NumInput
,
1
>
{}([
&
](
auto
I
)
{
in_global_load_tuple
(
I
).
Run
(
in_grid_2d_desc_tuple
[
I
],
in_global_load_tuple
(
I
).
Run
(
in_grid_2d_desc_tuple
[
I
],
in_global_buf_tuple
[
I
],
in_global_buf_tuple
[
I
],
...
@@ -177,7 +171,7 @@ struct GridwiseElementwise_2D
...
@@ -177,7 +171,7 @@ struct GridwiseElementwise_2D
in_thread_buf_tuple
(
I
));
in_thread_buf_tuple
(
I
));
in_global_load_tuple
(
I
).
MoveSrcSliceWindow
(
in_grid_2d_desc_tuple
[
I
],
in_global_load_tuple
(
I
).
MoveSrcSliceWindow
(
in_grid_2d_desc_tuple
[
I
],
loop_step_index_n
);
make_multi_index
(
0
,
loop_step_n
)
);
});
});
static_for
<
0
,
MPerThread
,
1
>
{}([
&
](
auto
iM
)
{
static_for
<
0
,
MPerThread
,
1
>
{}([
&
](
auto
iM
)
{
...
@@ -203,6 +197,9 @@ struct GridwiseElementwise_2D
...
@@ -203,6 +197,9 @@ struct GridwiseElementwise_2D
});
});
});
});
// static_for<0, MPerThread * NPerThread, 1>{}(
//[&](auto i) { out_thread_buf_tuple(I0)(i) = 1; });
static_for
<
0
,
NumOutput
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
NumOutput
,
1
>
{}([
&
](
auto
I
)
{
out_global_store_tuple
(
I
).
Run
(
thread_buffer_desc_mn
,
out_global_store_tuple
(
I
).
Run
(
thread_buffer_desc_mn
,
make_tuple
(
I0
,
I0
),
make_tuple
(
I0
,
I0
),
...
@@ -211,14 +208,21 @@ struct GridwiseElementwise_2D
...
@@ -211,14 +208,21 @@ struct GridwiseElementwise_2D
out_global_buf_tuple
(
I
));
out_global_buf_tuple
(
I
));
out_global_store_tuple
(
I
).
MoveDstSliceWindow
(
out_grid_2d_desc_tuple
[
I
],
out_global_store_tuple
(
I
).
MoveDstSliceWindow
(
out_grid_2d_desc_tuple
[
I
],
loop_step_index_n
);
make_multi_index
(
0
,
loop_step_n
)
);
});
});
}
while
(
--
num_iter_n
);
}
while
(
--
num_iter_n
);
static_for
<
0
,
NumInput
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
NumInput
,
1
>
{}([
&
](
auto
I
)
{
in_global_load_tuple
(
I
).
MoveSrcSliceWindow
(
in_grid_2d_desc_tuple
[
I
],
loop_step_index_m
);
in_global_load_tuple
(
I
).
MoveSrcSliceWindow
(
in_grid_2d_desc_tuple
[
I
],
make_multi_index
(
loop_step_m
,
-
(
N
/
loop_step_n
)
*
loop_step_n
));
});
});
static_for
<
0
,
NumOutput
,
1
>
{}([
&
](
auto
I
){
out_global_store_tuple
(
I
).
MoveDstSliceWindow
(
out_grid_2d_desc_tuple
[
I
],
loop_step_index_m
);
static_for
<
0
,
NumOutput
,
1
>
{}([
&
](
auto
I
)
{
out_global_store_tuple
(
I
).
MoveDstSliceWindow
(
out_grid_2d_desc_tuple
[
I
],
make_multi_index
(
loop_step_m
,
-
(
N
/
loop_step_n
)
*
loop_step_n
));
});
});
}
while
(
--
num_iter_m
);
}
while
(
--
num_iter_m
);
}
}
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
View file @
7e44fd84
...
@@ -149,6 +149,12 @@ struct ThreadwiseTensorSliceTransfer_v1r3
...
@@ -149,6 +149,12 @@ struct ThreadwiseTensorSliceTransfer_v1r3
const
bool
is_dst_valid
=
const
bool
is_dst_valid
=
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
dst_desc
,
dst_coord_
);
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
dst_desc
,
dst_coord_
);
// if(get_thread_global_1d_id() == 0)
//{
// const index_t dst_off = dst_coord_.GetOffset();
// printf("dst_off: %d\n", dst_off);
//}
// copy data from dst_vector into dst_buf
// copy data from dst_vector into dst_buf
dst_buf
.
template
Update
<
DstInMemOp
,
dst_vector_t
>(
dst_buf
.
template
Update
<
DstInMemOp
,
dst_vector_t
>(
dst_coord_
.
GetOffset
(),
dst_coord_
.
GetOffset
(),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment