Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
909f519c
Unverified
Commit
909f519c
authored
Jun 27, 2024
by
Harisankar Sadasivan
Committed by
GitHub
Jun 27, 2024
Browse files
Merge branch 'develop' into universal_streamk
parents
406fa265
3bb0fe6c
Changes
82
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
105 additions
and
0 deletions
+105
-0
test/smfmac_op/smfmac_op_xdl.cpp
test/smfmac_op/smfmac_op_xdl.cpp
+89
-0
test/wmma_op/wmma_op_util.hpp
test/wmma_op/wmma_op_util.hpp
+16
-0
No files found.
test/smfmac_op/smfmac_op_xdl.cpp
0 → 100644
View file @
909f519c
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "ck/ck.hpp"
#include "gtest/gtest.h"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "test/smfmac_op/smfmac_op_util.hpp"
// Element types exercised by the tests below.
using F32  = float;
using F16  = ck::half_t;
using BF16 = ck::bhalf_t;

// Layout tag and element-wise operation forwarded to the smfmac test helper.
using Row         = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
/// Typed GoogleTest fixture that unpacks one test configuration from `Tuple`
/// and forwards it to ck::smfmac_op_util::TestSmfmac.
///
/// Tuple slots, in order:
///   0 Src1Type, 1 Src1VecSize, 2 Src2Type, 3 Src2VecSize, 4 DstType,
///   5 AccVecSize, 6 GPUAccType, 7 CPUAccType, 8 M, 9 N, 10 K.
/// Slots 1, 3, 5 and 8-10 are value-initialized and their `value` member is read,
/// so they must be types such as ck::Number<...>.
template <typename Tuple>
class TestSmfmac : public ::testing::Test
{
    protected:
    // Shorthand for the type stored in slot `Slot` of the configuration tuple.
    template <std::size_t Slot>
    using TupleArg = std::tuple_element_t<Slot, Tuple>;

    // Type parameters of the configuration.
    using Src1Type   = TupleArg<0>;
    using Src2Type   = TupleArg<2>;
    using DstType    = TupleArg<4>;
    using GPUAccType = TupleArg<6>;
    using CPUAccType = TupleArg<7>;

    // Integral parameters of the configuration.
    static constexpr ck::index_t Src1VecSize = TupleArg<1>{}.value;
    static constexpr ck::index_t Src2VecSize = TupleArg<3>{}.value;
    static constexpr ck::index_t AccVecSize  = TupleArg<5>{}.value;
    static constexpr ck::index_t M           = TupleArg<8>{}.value;
    static constexpr ck::index_t N           = TupleArg<9>{}.value;
    static constexpr ck::index_t K           = TupleArg<10>{}.value;

    /// Instantiates the matmul kernel for this configuration, runs the
    /// ck::smfmac_op_util::TestSmfmac functor on every kernel in the list
    /// (currently just one) and expects all of them to report success.
    void Run()
    {
        constexpr auto default_kernel = ck::smfmac_op_util::matmul<Src1Type,
                                                                   Src1VecSize,
                                                                   Src2Type,
                                                                   Src2VecSize,
                                                                   GPUAccType,
                                                                   AccVecSize,
                                                                   DstType,
                                                                   M,
                                                                   N,
                                                                   K>;

        // Kernels under test; a tuple so further variants can be appended.
        constexpr auto kernel_list = std::make_tuple(default_kernel);
        using KernelList           = decltype(kernel_list);

        bool pass = true;

        ck::static_for<0, std::tuple_size_v<KernelList>, 1>{}([&](auto idx) {
            // Functor type specialised for the idx-th kernel; all three layouts are
            // Row and all three element-wise operations are PassThrough.
            using Checker =
                ck::smfmac_op_util::TestSmfmac<std::tuple_element_t<idx.value, KernelList>,
                                               Src1Type,
                                               Src2Type,
                                               DstType,
                                               GPUAccType,
                                               CPUAccType,
                                               Row,
                                               Row,
                                               Row,
                                               PassThrough,
                                               PassThrough,
                                               PassThrough,
                                               AccVecSize,
                                               M,
                                               N,
                                               K>;

            pass &= Checker{}(std::get<idx.value>(kernel_list));
        });

        EXPECT_TRUE(pass);
    }
};
template
<
ck
::
index_t
N
>
using
I
=
ck
::
Number
<
N
>
;
using
KernelTypes
=
::
testing
::
Types
<
std
::
tuple
<
F16
,
I
<
4
>
,
F16
,
I
<
8
>
,
F32
,
I
<
4
>
,
F32
,
F32
,
I
<
16
>
,
I
<
16
>
,
I
<
32
>>
,
std
::
tuple
<
BF16
,
I
<
4
>
,
BF16
,
I
<
8
>
,
F32
,
I
<
4
>
,
F32
,
F32
,
I
<
16
>
,
I
<
16
>
,
I
<
32
>>
,
std
::
tuple
<
F16
,
I
<
4
>
,
F16
,
I
<
8
>
,
F32
,
I
<
16
>
,
F32
,
F32
,
I
<
32
>
,
I
<
32
>
,
I
<
16
>>
,
std
::
tuple
<
BF16
,
I
<
4
>
,
BF16
,
I
<
8
>
,
F32
,
I
<
16
>
,
F32
,
F32
,
I
<
32
>
,
I
<
32
>
,
I
<
16
>>>
;
// Instantiate the TestSmfmac fixture once per configuration in KernelTypes.
TYPED_TEST_SUITE(TestSmfmac, KernelTypes);

// Single test body shared by all configurations: delegate to the fixture's Run().
TYPED_TEST(TestSmfmac, TestSmfmacFP16BF16)
{
    this->Run();
}
test/wmma_op/wmma_op_util.hpp
View file @
909f519c
...
...
@@ -140,10 +140,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
p_shared
[
8
*
16
*
lane_hi
+
8
*
lane_lo
+
ele
+
16
*
16
]
=
b_temp
[
ele
];
}
#ifdef __gfx12__
asm
volatile
(
"\
s_wait_dscnt 0x0
\n
\
s_barrier_signal -1
\n
\
s_barrier_wait -1 \
"
::
);
#else
asm
volatile
(
"\
s_waitcnt lgkmcnt(0)
\n
\
s_barrier \
"
::
);
#endif
for
(
int
ele
=
0
;
ele
<
16
;
++
ele
)
{
...
...
@@ -155,10 +163,18 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c)
a_frag
[
ele
]
=
p_shared
[(
ele
/
8
)
*
16
*
8
+
8
*
lane
+
ele
%
8
];
}
#ifdef __gfx12__
asm
volatile
(
"\
s_wait_dscnt 0x0
\n
\
s_barrier_signal -1
\n
\
s_barrier_wait -1 \
"
::
);
#else
asm
volatile
(
"\
s_waitcnt lgkmcnt(0)
\n
\
s_barrier \
"
::
);
#endif
// sync threads, similar to mma_sync
// __syncthreads();
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment