Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f3b6e205
"src/vscode:/vscode.git/clone" did not exist on "c33da3ec1893cbea879031e85fe77c497a0efadf"
Commit
f3b6e205
authored
Oct 29, 2023
by
Astha Rai
Browse files
fixed errors in test/profiler
parent
4dab86fe
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
65 additions
and
80 deletions
+65
-80
example/44_elementwise_permute/elementwise_permute_3d.cpp
example/44_elementwise_permute/elementwise_permute_3d.cpp
+10
-4
profiler/include/profiler/profile_transpose_impl.hpp
profiler/include/profiler/profile_transpose_impl.hpp
+48
-62
test/CMakeLists.txt
test/CMakeLists.txt
+1
-0
test/transpose/test_transpose.cpp
test/transpose/test_transpose.cpp
+4
-10
test/transpose/test_transpose_util.hpp
test/transpose/test_transpose_util.hpp
+2
-4
No files found.
example/44_elementwise_permute/elementwise_permute_3d.cpp
View file @
f3b6e205
...
...
@@ -50,14 +50,20 @@ int main()
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
const
int
N
=
4
;
/**
const int N = 4;
const int C = 16;
const int H = 32;
const int W = 5;
const
int
D
=
16
;
const int D = 16;
**/
std
::
vector
<
std
::
size_t
>
ncdhw
=
{
N
,
C
,
D
,
H
,
W
};
std
::
vector
<
std
::
size_t
>
nchwd
=
{
N
,
C
,
H
,
W
,
D
};
ck
::
index_t
N
=
4
;
ck
::
index_t
C
=
16
;
ck
::
index_t
H
=
32
;
ck
::
index_t
W
=
5
;
ck
::
index_t
D
=
16
;
std
::
vector
<
ck
::
index_t
>
ncdhw
=
{
N
,
C
,
D
,
H
,
W
};
std
::
vector
<
ck
::
index_t
>
nchwd
=
{
N
,
C
,
H
,
W
,
D
};
Tensor
<
ADataType
>
a
(
ncdhw
);
Tensor
<
BDataType
>
b
(
nchwd
);
...
...
profiler/include/profiler/profile_transpose_impl.hpp
View file @
f3b6e205
...
...
@@ -9,44 +9,56 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_
gemm_splitk
.hpp"
#include "ck/tensor_operation/gpu/device/device_
elementwise
.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/
gemm_splitk
.hpp"
#include "ck/library/tensor_operation_instance/gpu/
transpose_3d
.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
ADataType
,
typename
BDataType
>
bool
profile_gemm_splitk_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
int
N
,
int
C
,
int
D
,
int
H
,
int
W
)
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
Functor
>
void
host_elementwise4D
(
HostTensorB
&
B_nchwd
,
const
HostTensorA
&
A_ncdhw
,
Functor
functor
)
{
for
(
std
::
size_t
n
=
0
;
n
<
A_ncdhw
.
mDesc
.
GetLengths
()[
0
];
++
n
)
for
(
std
::
size_t
c
=
0
;
c
<
A_ncdhw
.
mDesc
.
GetLengths
()[
1
];
++
c
)
for
(
std
::
size_t
d
=
0
;
d
<
A_ncdhw
.
mDesc
.
GetLengths
()[
2
];
++
d
)
for
(
std
::
size_t
h
=
0
;
h
<
A_ncdhw
.
mDesc
.
GetLengths
()[
3
];
++
h
)
for
(
std
::
size_t
w
=
0
;
w
<
A_ncdhw
.
mDesc
.
GetLengths
()[
4
];
++
w
)
{
auto
a_val
=
A_ncdhw
(
n
,
c
,
d
,
h
,
w
);
functor
(
B_nchwd
(
n
,
c
,
h
,
w
,
d
),
a_val
);
}
}
template
<
typename
ADataType
,
typename
BDataType
,
index_t
NumDim
>
bool
profile_transpose_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
D
,
ck
::
index_t
H
,
ck
::
index_t
W
)
{
bool
pass
=
true
;
std
::
vector
<
std
::
size
_t
>
ncdhw
=
{
N
,
C
,
D
,
H
,
W
};
std
::
vector
<
std
::
size
_t
>
ndhwc
=
{
N
,
D
,
H
,
W
,
C
};
std
::
vector
<
ck
::
index
_t
>
ncdhw
=
{
N
,
C
,
D
,
H
,
W
};
std
::
vector
<
ck
::
index
_t
>
ndhwc
=
{
N
,
D
,
H
,
W
,
C
};
Tensor
<
ADataType
>
a
(
ncdhw
);
Tensor
<
BDataType
>
b
(
ndhwc
);
Tensor
<
BDataType
>
host_b
(
ndhwc
);
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
ck
::
index_t
,
5
>
ab_lengths
{
N
,
C
,
H
,
W
,
D
};
std
::
array
<
ck
::
index_t
,
5
>
a_strides
=
{
C
*
D
*
H
*
W
,
H
*
W
,
W
,
1
,
D
*
H
*
W
};
// N, C, D, H, W
std
::
array
<
ck
::
index_t
,
5
>
b_strides
=
{
C
*
H
*
W
*
D
,
H
*
W
*
D
,
W
*
D
,
D
,
1
};
// N, D, H, W, C
...
...
@@ -63,25 +75,17 @@ bool profile_gemm_splitk_impl(int do_verification,
using
ElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
const
auto
element_op
=
ElementOp
{};
//
const auto element_op = ElementOp{};
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a
.
mData
.
data
());
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceElementwise3dImpl
<
ck
::
Tuple
<
ADataType
>
,
ck
::
Tuple
<
BDataType
>
,
ElementOp
,
NumDim_m
,
NumDim_n
,
NumDim_k
,
MPerThread
,
NPerThread
,
KPerThread
,
ck
::
Sequence
<
InScalarPerVector
>
,
ck
::
Sequence
<
OutScalarPerVector
>>
;
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceElementwise
<
ck
::
Tuple
<
ADataType
>
,
ck
::
Tuple
<
BDataType
>
,
ElementOp
,
NumDim
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
...
...
@@ -91,19 +95,7 @@ bool profile_gemm_splitk_impl(int do_verification,
if
(
do_verification
)
{
using
ReferenceTransposeInstance
=
ck
::
tensor_operation
::
host
::
ReferenceTranspose
<<
ck
::
Tuple
<
ADataType
>
,
ck
::
Tuple
<
BDataType
>
,
ElementOp
,
NumDim_m
,
NumDim_n
,
NumDim_k
,
MPerThread
,
NPerThread
,
KPerThread
,
ck
::
Sequence
<
InScalarPerVector
>
,
ck
::
Sequence
<
OutScalarPerVector
>
>
;
auto
ref_transpose
=
ReferenceTransposeInstance
{};
auto
ref_invoker
=
ref_transpose
.
MakeInvoker
();
auto
ref_argument
=
ref_transpose
.
MakeArgument
(
ab_lengths
,
{
a_strides
},
{
b_strides
},
input
,
output
,
element_op
{})
ref_invoker
.
Run
(
ref_argument
);
host_elementwise4D
(
host_b
,
a
,
ElementOp
{});
}
std
::
string
best_op_name
;
...
...
@@ -114,7 +106,7 @@ bool profile_gemm_splitk_impl(int do_verification,
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
ab_lengths
,
{
a_strides
},
{
b_strides
},
input
,
output
,
e
lement
_o
p
{});
ab_lengths
,
{
a_strides
},
{
b_strides
},
input
,
output
,
E
lement
O
p
{});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
...
@@ -128,9 +120,11 @@ bool profile_gemm_splitk_impl(int do_verification,
if
(
do_verification
)
{
b_device_buf
.
FromDevice
(
b
_device_result
.
mData
.
data
());
b_device_buf
.
FromDevice
(
b
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
b_device_result
,
b_host_result
);
// pass = pass & ck::utils::check_err(b_device_result, b_host_result);
pass
&=
ck
::
utils
::
check_err
(
b
.
mData
,
host_b
.
mData
,
"Error: Incorrect results b"
,
1e-3
,
1e-3
);
if
(
do_log
)
{
...
...
@@ -158,7 +152,9 @@ bool profile_gemm_splitk_impl(int do_verification,
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
pass
=
pass
&
ck
::
utils
::
check_err
(
b_device_result
,
b_host_result
);
// pass = pass & ck::utils::check_err(b_device_result, b_host_result);
pass
&=
ck
::
utils
::
check_err
(
b
.
mData
,
host_b
.
mData
,
"Error: Incorrect results b"
,
1e-3
,
1e-3
);
if
(
tflops
>
best_tflops
)
{
...
...
@@ -173,22 +169,12 @@ bool profile_gemm_splitk_impl(int do_verification,
std
::
cout
<<
op_ptr
->
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
}
}
}
if
constexpr
(
is_same
<
BDataType
,
float
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = f32"
;
}
else
if
constexpr
(
is_same
<
BDataType
,
half_t
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = f16"
;
}
std
::
cout
<<
" N = "
<<
N
<<
" C = "
<<
C
<<
" D = "
<<
D
<<
" H = "
<<
H
<<
" W = "
<<
W
<<
" : "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
std
::
cout
<<
" N = "
<<
N
<<
" C = "
<<
C
<<
" D = "
<<
D
<<
" H = "
<<
H
<<
" W = "
<<
W
<<
" : "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
pass
;
return
pass
;
}
}
// namespace profiler
...
...
test/CMakeLists.txt
View file @
f3b6e205
...
...
@@ -156,6 +156,7 @@ add_subdirectory(pool)
add_subdirectory
(
batched_gemm_multi_d
)
add_subdirectory
(
grouped_convnd_bwd_data
)
add_subdirectory
(
conv_tensor_rearrange
)
add_subdirectory
(
transpose
)
if
(
GPU_TARGETS MATCHES
"gfx11"
)
add_subdirectory
(
wmma_op
)
endif
()
test/transpose/test_transpose.cpp
View file @
f3b6e205
...
...
@@ -5,19 +5,13 @@
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "test_tranpose_util.hpp"
#include "test_tran
s
pose_util.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
enum
struct
MatrixLayout
{
NCDHW
,
// 0
NCHWD
,
// 1
};
template
<
typename
Tuple
>
class
TestTranspose
:
public
ck
::
test
::
TestTranspose
<
typename
MatrixLayout
<
NCDHW
>::
typ
e
>
class
TestTranspose
:
public
ck
::
test
::
TestTranspose
<
Tupl
e
>
{
};
...
...
@@ -28,6 +22,6 @@ using KernelTypes = ::testing::Types<
>
;
// clang-format on
TYPED_TEST_SUITE
(
Test
GemmSplitK_MK_KN
,
KernelTypes
);
TYPED_TEST_SUITE
(
Test
Transpose
,
KernelTypes
);
//#include "test_transpose_ut_cases.inc"
\ No newline at end of file
//#include "test_transpose_ut_cases.inc"
test/transpose/test_transpose_
interface.c
pp
→
test/transpose/test_transpose_
util.h
pp
View file @
f3b6e205
...
...
@@ -24,8 +24,6 @@ class TestTranspose : public testing::Test
using
F32
=
float
;
protected:
// using ALayout = std::tuple_element_t<0, Tuple>;
// using BLayout = std::tuple_element_t<1, Tuple>;
using
ADataType
=
std
::
tuple_element_t
<
0
,
Tuple
>
;
using
BDataType
=
std
::
tuple_element_t
<
1
,
Tuple
>
;
...
...
@@ -42,11 +40,11 @@ class TestTranspose : public testing::Test
void
RunSingle
(
const
int
N
,
const
int
C
,
const
int
D
,
const
int
H
,
const
int
W
)
{
bool
pass
=
ck
::
profiler
::
profile_transpose_impl
<
ADataType
,
BDataType
,
>
(
bool
pass
=
ck
::
profiler
::
profile_transpose_impl
<
ADataType
,
BDataType
,
5
>
(
verify_
,
init_method_
,
log_
,
bench_
,
N
,
C
,
D
,
H
,
W
);
EXPECT_TRUE
(
pass
);
}
};
}
// namespace test
}
// namespace ck
\ No newline at end of file
}
// namespace ck
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment