Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
76c4719e
Commit
76c4719e
authored
Jul 24, 2023
by
Bartlomiej Kocot
Browse files
Skip inline version to builtin
parent
8a9daf16
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
13 deletions
+10
-13
include/ck/ck.hpp
include/ck/ck.hpp
+6
-2
include/ck/utility/inner_product.hpp
include/ck/utility/inner_product.hpp
+4
-4
script/profile_batched_gemm.sh
script/profile_batched_gemm.sh
+0
-7
No files found.
include/ck/ck.hpp
View file @
76c4719e
...
@@ -118,8 +118,12 @@
...
@@ -118,8 +118,12 @@
// inline asm
// inline asm
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INLINE_ASM 1
// inner product (DLOP)
// inner product (V_MAC/V_FMAC)
#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
#define CK_USE_AMD_V_MAC_INLINE_ASM 1
// V_DOT inline instructions, less efficient by the need to add s_nop to avoid
// hazard
#define CK_USE_AMD_V_DOT_INLINE_ASM 0
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
...
...
include/ck/utility/inner_product.hpp
View file @
76c4719e
...
@@ -13,13 +13,13 @@ __device__ void inner_product(const TA& a, const TB& b, TC& c);
...
@@ -13,13 +13,13 @@ __device__ void inner_product(const TA& a, const TB& b, TC& c);
template
<
>
template
<
>
__device__
void
inner_product
<
float
,
float
,
float
>
(
const
float
&
a
,
const
float
&
b
,
float
&
c
)
__device__
void
inner_product
<
float
,
float
,
float
>
(
const
float
&
a
,
const
float
&
b
,
float
&
c
)
{
{
#if CK_USE_AMD_
INNER_PRODUCT
_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
#if CK_USE_AMD_
V_MAC
_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_mac_f32 %0, %1, %2
\n
\
v_mac_f32 %0, %1, %2
\n
\
"
"
:
"=v"
(
c
)
:
"=v"
(
c
)
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
#elif CK_USE_AMD_
INNER_PRODUCT
_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
#elif CK_USE_AMD_
V_MAC
_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_fmac_f32 %0, %1, %2
\n
\
v_fmac_f32 %0, %1, %2
\n
\
"
"
...
@@ -76,7 +76,7 @@ template <>
...
@@ -76,7 +76,7 @@ template <>
__device__
void
inner_product
<
half2_t
,
half2_t
,
float
>
(
const
half2_t
&
a
,
const
half2_t
&
b
,
float
&
c
)
__device__
void
inner_product
<
half2_t
,
half2_t
,
float
>
(
const
half2_t
&
a
,
const
half2_t
&
b
,
float
&
c
)
{
{
#if defined(CK_USE_AMD_V_DOT2_F32_F16)
#if defined(CK_USE_AMD_V_DOT2_F32_F16)
#if CK_USE_AMD_
INNER_PRODUC
T_INLINE_ASM
#if CK_USE_AMD_
V_DO
T_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_dot2_f32_f16 %0, %1, %2, %0
\n
\
v_dot2_f32_f16 %0, %1, %2, %0
\n
\
...
@@ -165,7 +165,7 @@ __device__ void
...
@@ -165,7 +165,7 @@ __device__ void
inner_product
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
inner_product
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
{
{
#if defined(CK_USE_AMD_V_DOT4_I32_I8)
#if defined(CK_USE_AMD_V_DOT4_I32_I8)
#if CK_USE_AMD_
INNER_PRODUC
T_INLINE_ASM
#if CK_USE_AMD_
V_DO
T_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_dot4_i32_i8 %0, %1, %2, %0
\n
\
v_dot4_i32_i8 %0, %1, %2, %0
\n
\
...
...
script/profile_batched_gemm.sh
View file @
76c4719e
...
@@ -3,13 +3,6 @@
...
@@ -3,13 +3,6 @@
## GPU visibility
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
export
HIP_VISIBLE_DEVICES
=
0
DRIVER
=
"../build/bin/ckProfiler"
DRIVER
=
"../build/bin/ckProfiler"
OP
=
$1
DATATYPE
=
$2
LAYOUT
=
$3
VERIFY
=
$4
INIT
=
$5
LOG
=
$6
TIME
=
$7
OP
=
$1
OP
=
$1
DATATYPE
=
$2
DATATYPE
=
$2
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment