Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7d44e782
"configs/git@developer.sourcefind.cn:OpenDAS/opencompass.git" did not exist on "a6552224cb2651706c2aa8be481e1dc27fb2cb1a"
Commit
7d44e782
authored
May 18, 2022
by
rocking
Browse files
Support any dimension for elementwise operation
parent
06e52d90
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
138 additions
and
44 deletions
+138
-44
example/19_binary_elementwise/CMakeLists.txt
example/19_binary_elementwise/CMakeLists.txt
+2
-1
example/19_binary_elementwise/elementwise_add_4d.cpp
example/19_binary_elementwise/elementwise_add_4d.cpp
+117
-0
include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
...tensor_operation/gpu/device/device_binary_elementwise.hpp
+19
-43
No files found.
example/19_binary_elementwise/CMakeLists.txt
View file @
7d44e782
add_example_executable
(
example_broadcast_add_2d broadcast_add_2d.cpp
)
add_example_executable
(
example_broadcast_add_2d broadcast_add_2d.cpp
)
add_example_executable
(
example_elementwise_add_1d elementwise_add_1d.cpp
)
add_example_executable
(
example_elementwise_add_1d elementwise_add_1d.cpp
)
\ No newline at end of file
add_example_executable
(
example_elementwise_add_4d elementwise_add_4d.cpp
)
\ No newline at end of file
example/19_binary_elementwise/elementwise_add_4d.cpp
0 → 100644
View file @
7d44e782
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include <math.h>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "binary_element_wise_operation.hpp"
#include "device_binary_elementwise.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
ABDataType
=
F16
;
using
CDataType
=
F16
;
using
EltwiseComputeDataType
=
F32
;
using
Add
=
ck
::
tensor_operation
::
binary_element_wise
::
Add
;
using
DeviceElementwiseAddInstance
=
ck
::
tensor_operation
::
device
::
DeviceBinaryElementwise
<
F16
,
F16
,
CDataType
,
EltwiseComputeDataType
,
Add
,
4
,
8
>
;
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
ComputeDataType
,
typename
Functor
>
void
host_elementwise4D
(
HostTensorC
&
C
,
const
HostTensorA
&
A
,
const
HostTensorB
&
B
,
const
std
::
vector
<
std
::
size_t
>&
shape
,
Functor
functor
)
{
for
(
std
::
size_t
n
=
0
;
n
<
shape
[
0
];
++
n
)
for
(
std
::
size_t
c
=
0
;
c
<
shape
[
1
];
++
c
)
for
(
std
::
size_t
h
=
0
;
h
<
shape
[
2
];
++
h
)
for
(
std
::
size_t
w
=
0
;
w
<
shape
[
3
];
++
w
)
{
ComputeDataType
a_val
=
static_cast
<
ComputeDataType
>
(
A
(
n
,
c
,
h
,
w
));
ComputeDataType
b_val
=
static_cast
<
ComputeDataType
>
(
B
(
n
,
c
,
h
,
w
));
ComputeDataType
c_val
=
0
;
functor
(
c_val
,
a_val
,
b_val
);
C
(
n
,
c
,
h
,
w
)
=
static_cast
<
ComputeDataType
>
(
c_val
);
}
}
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
false
;
std
::
vector
<
std
::
size_t
>
nchw
=
{
4
,
16
,
32
,
32
};
Tensor
<
ABDataType
>
a_m
(
nchw
);
Tensor
<
ABDataType
>
b_m
(
nchw
);
Tensor
<
ABDataType
>
c_m
(
nchw
);
a_m
.
GenerateTensorValue
(
GeneratorTensor_3
<
ABDataType
>
{
0.0
,
1.0
});
b_m
.
GenerateTensorValue
(
GeneratorTensor_3
<
ABDataType
>
{
0.0
,
1.0
});
DeviceMem
a_m_device_buf
(
sizeof
(
ABDataType
)
*
a_m
.
mDesc
.
GetElementSpace
());
DeviceMem
b_m_device_buf
(
sizeof
(
ABDataType
)
*
b_m
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_device_buf
(
sizeof
(
CDataType
)
*
c_m
.
mDesc
.
GetElementSpace
());
a_m_device_buf
.
ToDevice
(
a_m
.
mData
.
data
());
b_m_device_buf
.
ToDevice
(
b_m
.
mData
.
data
());
auto
broadcastAdd
=
DeviceElementwiseAddInstance
{};
auto
argument
=
broadcastAdd
.
MakeArgumentPointer
(
a_m_device_buf
.
GetDeviceBuffer
(),
b_m_device_buf
.
GetDeviceBuffer
(),
c_m_device_buf
.
GetDeviceBuffer
(),
ck
::
to_int_vector
(
nchw
),
ck
::
to_int_vector
(
a_m
.
mDesc
.
GetStrides
()),
ck
::
to_int_vector
(
b_m
.
mDesc
.
GetStrides
()),
ck
::
to_int_vector
(
c_m
.
mDesc
.
GetStrides
()),
Add
{},
256
);
if
(
!
broadcastAdd
.
IsSupportedArgument
(
argument
.
get
()))
{
throw
std
::
runtime_error
(
"The runtime parameters seems not supported by the "
"DeviceBinaryElementwise_2D instance, exiting!"
);
};
auto
broadcastAdd_invoker_ptr
=
broadcastAdd
.
MakeInvokerPointer
();
float
ave_time
=
broadcastAdd_invoker_ptr
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
c_m_device_buf
.
FromDevice
(
c_m
.
mData
.
data
());
Tensor
<
CDataType
>
host_c_m
(
nchw
);
host_elementwise4D
<
Tensor
<
ABDataType
>
,
Tensor
<
ABDataType
>
,
Tensor
<
CDataType
>
,
EltwiseComputeDataType
,
Add
>
(
host_c_m
,
a_m
,
b_m
,
nchw
,
Add
{});
pass
&=
ck
::
utils
::
check_err
(
c_m
.
mData
,
host_c_m
.
mData
,
"Error: Incorrect results d1"
,
1e-3
,
1e-3
);
}
return
pass
?
0
:
1
;
}
include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
View file @
7d44e782
...
@@ -35,54 +35,30 @@ struct DeviceBinaryElementwise : public BaseOperator
...
@@ -35,54 +35,30 @@ struct DeviceBinaryElementwise : public BaseOperator
return
desc_m0_pad
;
return
desc_m0_pad
;
}
}
static
auto
MakeDescriptor_M0_1d
(
const
std
::
vector
<
int
>&
shape
,
const
std
::
vector
<
int
>&
stride
,
index_t
gridSize
,
index_t
threadPerBlock
)
{
const
auto
desc_m0
=
make_naive_tensor_descriptor
(
make_tuple
(
shape
[
0
]),
make_tuple
(
stride
[
0
]));
return
PadDescriptor_M0_1d
(
desc_m0
,
gridSize
,
threadPerBlock
);
}
static
auto
MakeDescriptor_M0_2d
(
const
std
::
vector
<
int
>&
shape
,
const
std
::
vector
<
int
>&
stride
,
index_t
gridSize
,
index_t
threadPerBlock
)
{
const
int
m
=
shape
[
0
];
const
int
n
=
shape
[
1
];
// 2d desc - [m, n]
const
auto
desc_m_n
=
make_naive_tensor_descriptor
(
make_tuple
(
m
,
n
),
make_tuple
(
stride
[
0
],
stride
[
1
]));
// 1d desc - [m * n]
const
auto
desc_m0
=
transform_tensor_descriptor
(
desc_m_n
,
make_tuple
(
make_merge_transform
(
make_tuple
(
m
,
n
))),
make_tuple
(
Sequence
<
0
,
1
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
PadDescriptor_M0_1d
(
desc_m0
,
gridSize
,
threadPerBlock
);
}
static
auto
MakeDescriptor_M0
(
const
std
::
vector
<
int
>&
shape
,
static
auto
MakeDescriptor_M0
(
const
std
::
vector
<
int
>&
shape
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
stride
,
index_t
gridSize
,
index_t
gridSize
,
index_t
threadPerBlock
)
index_t
threadPerBlock
)
{
{
static_assert
(
Dim
==
1
||
Dim
==
2
,
auto
tupleOfShape
=
generate_tuple
([
&
](
auto
I
)
{
return
shape
[
I
];
},
Number
<
Dim
>
{});
"wrong! DeviceBinaryElementwise not support this dimension"
);
auto
tupleOfStride
=
generate_tuple
([
&
](
auto
I
)
{
return
stride
[
I
];
},
Number
<
Dim
>
{});
// TODO - 3D, 4D, 5D
// nd desc - [s0, s1, s2, ...]
if
constexpr
(
Dim
==
1
)
const
auto
desc
=
make_naive_tensor_descriptor
(
tupleOfShape
,
tupleOfStride
);
return
MakeDescriptor_M0_1d
(
shape
,
stride
,
gridSize
,
threadPerBlock
);
else
if
constexpr
(
Dim
==
2
)
// merge nd to 1d desc - [s0 * s1 * ...]
return
MakeDescriptor_M0_2d
(
shape
,
stride
,
gridSize
,
threadPerBlock
);
if
constexpr
(
Dim
>
1
)
{
const
auto
desc_m0
=
transform_tensor_descriptor
(
desc
,
make_tuple
(
make_merge_transform
(
tupleOfShape
)),
make_tuple
(
generate_sequence_v2
([
&
](
auto
I
)
{
return
I
;
},
Number
<
Dim
>
{})),
make_tuple
(
Sequence
<
0
>
{}));
return
PadDescriptor_M0_1d
(
desc_m0
,
gridSize
,
threadPerBlock
);
}
else
else
return
make_naive_tensor_descriptor
(
make_tuple
(
0
),
make_tuple
(
0
)
);
return
PadDescriptor_M0_1d
(
desc
,
gridSize
,
threadPerBlock
);
}
}
using
GridDesc_M0
=
decltype
(
MakeDescriptor_M0
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
GridDesc_M0
=
decltype
(
MakeDescriptor_M0
({
1
,
1
},
{
1
,
1
},
1
,
1
));
...
@@ -169,7 +145,7 @@ struct DeviceBinaryElementwise : public BaseOperator
...
@@ -169,7 +145,7 @@ struct DeviceBinaryElementwise : public BaseOperator
if
(
pArg
==
nullptr
)
if
(
pArg
==
nullptr
)
return
false
;
return
false
;
//
m * n
//
shape[0] * shape[1] * shape[2] * ...
const
auto
m0
=
pArg
->
c_grid_desc_m0_
.
GetLength
(
I0
);
const
auto
m0
=
pArg
->
c_grid_desc_m0_
.
GetLength
(
I0
);
if
(
m0
%
ScalarPerVector
!=
0
)
if
(
m0
%
ScalarPerVector
!=
0
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment