Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
ddefb951
"...composable_kernel.git" did not exist on "f95267f166927bee1d806cefbdc142b2e35f640f"
Commit
ddefb951
authored
Nov 07, 2023
by
Astha Rai
Browse files
updated example with 1d kernel
parent
4a20c076
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
138 deletions
+25
-138
example/44_elementwise_permute/elementwise_permute.cpp
example/44_elementwise_permute/elementwise_permute.cpp
+25
-12
example/44_elementwise_permute/elementwise_permute_3d.cpp
example/44_elementwise_permute/elementwise_permute_3d.cpp
+0
-126
profiler/src/profile_transpose.cpp
profiler/src/profile_transpose.cpp
+0
-0
No files found.
example/44_elementwise_permute/elementwise_permute.cpp
View file @
ddefb951
...
@@ -24,11 +24,11 @@ using DeviceElementwisePermuteInstance =
...
@@ -24,11 +24,11 @@ using DeviceElementwisePermuteInstance =
PassThrough
,
// ElementwiseOp
PassThrough
,
// ElementwiseOp
5
,
// NumDim
5
,
// NumDim
8
,
// MPerThread
8
,
// MPerThread
ck
::
Sequence
<
8
>
,
// InScalarPerVectorSeq
ck
::
Sequence
<
1
>
,
// InScalarPerVectorSeq
ck
::
Sequence
<
1
>>
;
// OutScalarPerVectorSeq
ck
::
Sequence
<
1
>>
;
// OutScalarPerVectorSeq
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
Functor
>
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
Functor
>
void
host_elementwise4D
(
HostTensorB
&
B_n
c
hw
d
,
const
HostTensorA
&
A_ncdhw
,
Functor
functor
)
void
host_elementwise4D
(
HostTensorB
&
B_n
d
hw
c
,
const
HostTensorA
&
A_ncdhw
,
Functor
functor
)
{
{
for
(
std
::
size_t
n
=
0
;
n
<
A_ncdhw
.
mDesc
.
GetLengths
()[
0
];
++
n
)
for
(
std
::
size_t
n
=
0
;
n
<
A_ncdhw
.
mDesc
.
GetLengths
()[
0
];
++
n
)
for
(
std
::
size_t
c
=
0
;
c
<
A_ncdhw
.
mDesc
.
GetLengths
()[
1
];
++
c
)
for
(
std
::
size_t
c
=
0
;
c
<
A_ncdhw
.
mDesc
.
GetLengths
()[
1
];
++
c
)
...
@@ -37,7 +37,7 @@ void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functo
...
@@ -37,7 +37,7 @@ void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functo
for
(
std
::
size_t
w
=
0
;
w
<
A_ncdhw
.
mDesc
.
GetLengths
()[
4
];
++
w
)
for
(
std
::
size_t
w
=
0
;
w
<
A_ncdhw
.
mDesc
.
GetLengths
()[
4
];
++
w
)
{
{
auto
a_val
=
A_ncdhw
(
n
,
c
,
d
,
h
,
w
);
auto
a_val
=
A_ncdhw
(
n
,
c
,
d
,
h
,
w
);
functor
(
B_n
c
hw
d
(
n
,
c
,
h
,
w
,
d
),
a_val
);
functor
(
B_n
d
hw
c
(
n
,
d
,
h
,
w
,
c
),
a_val
);
}
}
}
}
...
@@ -47,9 +47,9 @@ int main()
...
@@ -47,9 +47,9 @@ int main()
bool
time_kernel
=
true
;
bool
time_kernel
=
true
;
std
::
vector
<
std
::
size_t
>
ncdhw
=
{
16
,
8
,
8
,
8
,
8
};
std
::
vector
<
std
::
size_t
>
ncdhw
=
{
16
,
8
,
8
,
8
,
8
};
std
::
vector
<
std
::
size_t
>
n
c
hw
d
=
{
16
,
8
,
8
,
8
,
8
};
std
::
vector
<
std
::
size_t
>
n
d
hw
c
=
{
16
,
8
,
8
,
8
,
8
};
Tensor
<
ADataType
>
a
(
ncdhw
);
Tensor
<
ADataType
>
a
(
ncdhw
);
Tensor
<
BDataType
>
b
(
n
c
hw
d
);
Tensor
<
BDataType
>
b
(
n
d
hw
c
);
a
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
a
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
...
@@ -62,19 +62,32 @@ int main()
...
@@ -62,19 +62,32 @@ int main()
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
ck
::
index_t
,
5
>
ab_lengths
;
std
::
array
<
ck
::
index_t
,
5
>
ab_lengths
;
std
::
array
<
ck
::
index_t
,
5
>
a_strides
=
{
/**
std::array<ck::index_t, 5> a_strides = {
static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[2] * ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[3] * ncdhw[4]),
static_cast<int>(ncdhw[4]),
static_cast<int>(ncdhw[4]),
1};
1};
std::array<ck::index_t, 5> b_strides = {
std::array<ck::index_t, 5> b_strides = {
static_cast
<
int
>
(
n
c
hw
d
[
1
]
*
n
c
hw
d
[
2
]
*
n
c
hw
d
[
3
]
*
n
c
hw
d
[
4
]),
static_cast<int>(n
d
hw
c
[1] * n
d
hw
c
[2] * n
d
hw
c
[3] * n
d
hw
c
[4]),
static_cast
<
int
>
(
n
c
hw
d
[
2
]
*
n
c
hw
d
[
3
]
*
n
c
hw
d
[
4
]),
static_cast<int>(n
d
hw
c
[2] * n
d
hw
c
[3] * n
d
hw
c
[4]),
1,
1,
static_cast
<
int
>
(
n
c
hw
d
[
3
]
*
n
c
hw
d
[
4
]),
static_cast<int>(n
d
hw
c
[3] * n
d
hw
c
[4]),
static_cast
<
int
>
(
n
c
hw
d
[
4
])};
static_cast<int>(n
d
hw
c
[4])};
**/
std
::
array
<
ck
::
index_t
,
5
>
a_strides
=
{
static_cast
<
int
>
(
ncdhw
[
1
]
*
ncdhw
[
2
]
*
ncdhw
[
3
]
*
ncdhw
[
4
]),
static_cast
<
int
>
(
ncdhw
[
3
]
*
ncdhw
[
4
]),
static_cast
<
int
>
(
ncdhw
[
4
]),
1
,
static_cast
<
int
>
(
ncdhw
[
2
]
*
ncdhw
[
3
]
*
ncdhw
[
4
])};
std
::
array
<
ck
::
index_t
,
5
>
b_strides
=
{
static_cast
<
int
>
(
ndhwc
[
1
]
*
ndhwc
[
2
]
*
ndhwc
[
3
]
*
ndhwc
[
4
]),
static_cast
<
int
>
(
ndhwc
[
2
]
*
ndhwc
[
3
]
*
ndhwc
[
4
]),
static_cast
<
int
>
(
ndhwc
[
3
]
*
ndhwc
[
4
]),
static_cast
<
int
>
(
ndhwc
[
4
]),
1
};
ck
::
ranges
::
copy
(
ncdhw
,
ab_lengths
.
begin
());
ck
::
ranges
::
copy
(
ncdhw
,
ab_lengths
.
begin
());
auto
broadcastPermute
=
DeviceElementwisePermuteInstance
{};
auto
broadcastPermute
=
DeviceElementwisePermuteInstance
{};
...
@@ -88,7 +101,7 @@ int main()
...
@@ -88,7 +101,7 @@ int main()
};
};
std
::
cout
<<
"A (ncdhw): "
<<
a
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"A (ncdhw): "
<<
a
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"B (n
c
hw
d
): "
<<
b
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"B (n
d
hw
c
): "
<<
b
.
mDesc
<<
std
::
endl
;
auto
broadcastPermute_invoker_ptr
=
broadcastPermute
.
MakeInvokerPointer
();
auto
broadcastPermute_invoker_ptr
=
broadcastPermute
.
MakeInvokerPointer
();
float
ave_time
=
float
ave_time
=
...
@@ -111,7 +124,7 @@ int main()
...
@@ -111,7 +124,7 @@ int main()
if
(
do_verification
)
if
(
do_verification
)
{
{
b_device_buf
.
FromDevice
(
b
.
mData
.
data
());
b_device_buf
.
FromDevice
(
b
.
mData
.
data
());
Tensor
<
BDataType
>
host_b
(
n
c
hw
d
);
Tensor
<
BDataType
>
host_b
(
n
d
hw
c
);
host_elementwise4D
(
host_b
,
a
,
PassThrough
{});
host_elementwise4D
(
host_b
,
a
,
PassThrough
{});
pass
&=
pass
&=
...
...
example/44_elementwise_permute/elementwise_permute_3d.cpp
deleted
100644 → 0
View file @
4a20c076
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
// Scalar type shorthands used by this example.
using F16 = ck::half_t;
using F32 = float;

// Element types of the input tensor (A) and the output tensor (B); both are F16.
using ADataType = F16;
using BDataType = F16;

// Element-wise operation handed to the device instance and to the host reference.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Device operation exercised by this example. The trailing comments name the
// template parameter each argument is bound to.
using DeviceElementwisePermuteInstance =
    ck::tensor_operation::device::DeviceElementwise3dImpl<
        ck::Tuple<ADataType>, // InDataTypeTuple
        ck::Tuple<BDataType>, // OutDataTypeTuple
        PassThrough,          // ElementwiseOp
        2,                    // NumDim_m, {N, C}
        2,                    // NumDim_n, {H, W}
        1,                    // NumDim_k, {D}
        8,                    // MPerThread
        8,                    // NPerThread
        8,                    // KPerThread
        ck::Sequence<1>,      // InScalarPerVectorSeq
        ck::Sequence<1>>;     // OutScalarPerVectorSeq
/// Host-side reference for the permuting element-wise operation.
///
/// Walks every index (n, c, d, h, w) inside the extents reported by
/// `A_ncdhw.mDesc.GetLengths()` (entries 0..4), reads `A_ncdhw(n, c, d, h, w)`
/// and invokes `functor(B_nchwd(n, c, h, w, d), value)`. The destination is
/// therefore addressed with the `d` coordinate moved to the last position.
///
/// @param B_nchwd  Destination tensor, indexed as (n, c, h, w, d).
/// @param A_ncdhw  Source tensor, indexed as (n, c, d, h, w); its lengths bound the loops.
/// @param functor  Callable invoked as `functor(destination_element, source_value)`.
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
{
    // Looks the extent up on every evaluation, exactly where the loop condition needs it.
    const auto extent = [&A_ncdhw](std::size_t axis) {
        return A_ncdhw.mDesc.GetLengths()[axis];
    };

    for(std::size_t n = 0; n < extent(0); ++n)
    {
        for(std::size_t c = 0; c < extent(1); ++c)
        {
            for(std::size_t d = 0; d < extent(2); ++d)
            {
                for(std::size_t h = 0; h < extent(3); ++h)
                {
                    for(std::size_t w = 0; w < extent(4); ++w)
                    {
                        // Copy the source element first, then hand it to the functor
                        // together with the permuted destination slot.
                        auto source_value = A_ncdhw(n, c, d, h, w);
                        functor(B_nchwd(n, c, h, w, d), source_value);
                    }
                }
            }
        }
    }
}
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
/**const int N = 4;
const int C = 16;
const int H = 32;
const int W = 5;
const int D = 16;**/
ck
::
index_t
N
=
4
;
ck
::
index_t
C
=
16
;
ck
::
index_t
H
=
32
;
ck
::
index_t
W
=
5
;
ck
::
index_t
D
=
16
;
std
::
vector
<
ck
::
index_t
>
ncdhw
=
{
N
,
C
,
D
,
H
,
W
};
std
::
vector
<
ck
::
index_t
>
nchwd
=
{
N
,
C
,
H
,
W
,
D
};
Tensor
<
ADataType
>
a
(
ncdhw
);
Tensor
<
BDataType
>
b
(
nchwd
);
a
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a
.
mData
.
data
());
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
ck
::
index_t
,
5
>
ab_lengths
{
N
,
C
,
H
,
W
,
D
};
std
::
array
<
ck
::
index_t
,
5
>
a_strides
=
{
C
*
D
*
H
*
W
,
D
*
H
*
W
,
W
,
1
,
H
*
W
};
// N, C, D, H, W
std
::
array
<
ck
::
index_t
,
5
>
b_strides
=
{
C
*
H
*
W
*
D
,
H
*
W
*
D
,
W
*
D
,
D
,
1
};
// N, C, H, W, D
auto
broadcastPermute
=
DeviceElementwisePermuteInstance
{};
auto
argument
=
broadcastPermute
.
MakeArgumentPointer
(
ab_lengths
,
{
a_strides
},
{
b_strides
},
input
,
output
,
PassThrough
{});
if
(
!
broadcastPermute
.
IsSupportedArgument
(
argument
.
get
()))
{
throw
std
::
runtime_error
(
"The runtime parameters seems not supported by the device instance, exiting!"
);
};
std
::
cout
<<
"A (ncdhw): "
<<
a
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"B (nchwd): "
<<
b
.
mDesc
<<
std
::
endl
;
auto
broadcastPermute_invoker_ptr
=
broadcastPermute
.
MakeInvokerPointer
();
float
ave_time
=
broadcastPermute_invoker_ptr
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
ncdhw
[
0
]
*
ncdhw
[
1
]
*
ncdhw
[
2
]
*
ncdhw
[
3
]
*
ncdhw
[
4
];
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
(
ncdhw
[
0
]
*
ncdhw
[
1
]
*
ncdhw
[
2
]
*
ncdhw
[
3
]
*
ncdhw
[
4
])
+
sizeof
(
BDataType
)
*
(
ncdhw
[
0
]
*
ncdhw
[
1
]
*
ncdhw
[
2
]
*
ncdhw
[
3
]
*
ncdhw
[
4
]);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
b_device_buf
.
FromDevice
(
b
.
mData
.
data
());
Tensor
<
BDataType
>
host_b
(
nchwd
);
host_elementwise4D
(
host_b
,
a
,
PassThrough
{});
pass
&=
ck
::
utils
::
check_err
(
b
.
mData
,
host_b
.
mData
,
"Error: Incorrect results b"
,
1e-3
,
1e-3
);
}
return
pass
?
0
:
1
;
}
profiler/
include/profiler
/profile_transpose.cpp
→
profiler/
src
/profile_transpose.cpp
View file @
ddefb951
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment