Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
df4cc03f
Commit
df4cc03f
authored
Jul 24, 2023
by
Bartlomiej
Committed by
Bartlomiej Kocot
Jul 27, 2023
Browse files
Add s_nops after v_dot to avoid hazard
parent
e7dca79d
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
19 additions
and
21 deletions
+19
-21
include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
...ion/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
+7
-8
include/ck/utility/inner_product.hpp
include/ck/utility/inner_product.hpp
+10
-9
test/batched_gemm_multi_d/CMakeLists.txt
test/batched_gemm_multi_d/CMakeLists.txt
+2
-4
No files found.
include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
View file @
df4cc03f
...
@@ -70,10 +70,9 @@ __global__ void
...
@@ -70,10 +70,9 @@ __global__ void
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
,
const
Block2CTileMap
block_2_ctile_map
)
const
Block2CTileMap
block_2_ctile_map
)
{
{
// TODO: Enable for gfx90a after complier fix
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) ||
defined(__gfx1101__) ||
\
defined(__gfx90a__) ||
defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx1102__))
defined(__gfx1101__) ||
defined(__gfx1102__))
const
index_t
num_blocks_per_batch
=
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
...
@@ -650,10 +649,10 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout,
...
@@ -650,10 +649,10 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout,
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
// TODO: Enable for gfx90a after complier fix
// TODO: Enable for gfx90a after complier fix
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx90
8
"
||
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx90
a
"
||
ck
::
get_device_name
()
==
"gfx
1030
"
||
ck
::
get_device_name
()
==
"gfx
94
0"
||
ck
::
get_device_name
()
==
"gfx
908
"
||
ck
::
get_device_name
()
==
"gfx
103
0"
||
ck
::
get_device_name
()
==
"gfx
110
0"
||
ck
::
get_device_name
()
==
"gfx110
1
"
||
ck
::
get_device_name
()
==
"gfx
94
0"
||
ck
::
get_device_name
()
==
"gfx110
0
"
||
ck
::
get_device_name
()
==
"gfx1102"
)
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
{
{
bool
pass
=
true
;
bool
pass
=
true
;
pass
=
pass
&&
arg
.
K_
%
K1
==
0
;
pass
=
pass
&&
arg
.
K_
%
K1
==
0
;
...
...
include/ck/utility/inner_product.hpp
View file @
df4cc03f
...
@@ -75,16 +75,17 @@ inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, f
...
@@ -75,16 +75,17 @@ inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, f
template
<
>
template
<
>
__device__
void
inner_product
<
half2_t
,
half2_t
,
float
>
(
const
half2_t
&
a
,
const
half2_t
&
b
,
float
&
c
)
__device__
void
inner_product
<
half2_t
,
half2_t
,
float
>
(
const
half2_t
&
a
,
const
half2_t
&
b
,
float
&
c
)
{
{
#if defined(CK_USE_AMD_V_DOT2_F32_F16)
// builtin is disabled because it does not generate s_nop
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
// and this can lead to hazards
#if defined(CK_USE_AMD_V_DOT2_F32_F16) && CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_dot2_f32_f16 %0, %1, %2, %0
\n
\
v_dot2_f32_f16 %0, %1, %2, %0
\n
\
s_nop 2
\n
\
"
"
:
"=v"
(
c
)
:
"=v"
(
c
)
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
:
"v"
(
a
),
"v"
(
b
),
"0"
(
c
));
#else
c
=
__builtin_amdgcn_sdot2
(
a
,
b
,
c
,
false
);
c
=
__builtin_amdgcn_sdot2
(
a
,
b
,
c
,
false
);
#endif
#else
#else
const
vector_type
<
half_t
,
2
>
a_vector
{
a
};
const
vector_type
<
half_t
,
2
>
a_vector
{
a
};
const
vector_type
<
half_t
,
2
>
b_vector
{
b
};
const
vector_type
<
half_t
,
2
>
b_vector
{
b
};
...
@@ -162,16 +163,16 @@ template <>
...
@@ -162,16 +163,16 @@ template <>
__device__
void
__device__
void
inner_product
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
inner_product
<
int8x4_t
,
int8x4_t
,
int32_t
>
(
const
int8x4_t
&
a
,
const
int8x4_t
&
b
,
int32_t
&
c
)
{
{
#if defined(CK_USE_AMD_V_DOT4_I32_I8)
// builtin is disabled because it does not generate s_nop
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
// and this can lead to hazards
#if defined(CK_USE_AMD_V_DOT4_I32_I8) && CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm
volatile
(
"
\n
\
asm
volatile
(
"
\n
\
v_dot4_i32_i8 %0, %1, %2, %0
\n
\
v_dot4_i32_i8 %0, %1, %2, %0
\n
\
s_nop 2
\n
\
"
"
:
"=v"
(
c
)
:
"=v"
(
c
)
:
"v"
(
bit_cast
<
int32_t
>
(
a
)),
"v"
(
bit_cast
<
int32_t
>
(
b
)),
"0"
(
c
));
:
"v"
(
bit_cast
<
int32_t
>
(
a
)),
"v"
(
bit_cast
<
int32_t
>
(
b
)),
"0"
(
c
));
#else
c
=
__builtin_amdgcn_sdot4
(
bit_cast
<
int32_t
>
(
a
),
bit_cast
<
int32_t
>
(
b
),
c
,
false
);
#endif
#else
#else
const
vector_type
<
int8_t
,
4
>
a_vector
{
a
};
const
vector_type
<
int8_t
,
4
>
a_vector
{
a
};
const
vector_type
<
int8_t
,
4
>
b_vector
{
b
};
const
vector_type
<
int8_t
,
4
>
b_vector
{
b
};
...
...
test/batched_gemm_multi_d/CMakeLists.txt
View file @
df4cc03f
# TODO: Enable for gfx90a after complier fix
# TODO: Enable for gfx90a after complier fix
if
(
DL_KERNELS
)
if
(
DL_KERNELS
)
if
(
NOT GPU_TARGETS MATCHES
"gfx90a"
)
add_gtest_executable
(
test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp
)
add_gtest_executable
(
test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp
)
target_link_libraries
(
test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance
)
target_link_libraries
(
test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance
)
endif
()
endif
()
endif
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment