Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
895e8c40
Commit
895e8c40
authored
Jul 31, 2024
by
Jing Zhang
Browse files
replace buffer_atomic with global_atomic
parent
733f33af
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
40 additions
and
7 deletions
+40
-7
cmake/gtest.cmake
cmake/gtest.cmake
+2
-1
include/ck/utility/amd_buffer_addressing.hpp
include/ck/utility/amd_buffer_addressing.hpp
+33
-3
include/ck/utility/dynamic_buffer.hpp
include/ck/utility/dynamic_buffer.hpp
+4
-2
script/cmake-ck-dev.sh
script/cmake-ck-dev.sh
+1
-1
No files found.
cmake/gtest.cmake
View file @
895e8c40
...
@@ -8,7 +8,8 @@ endif()
...
@@ -8,7 +8,8 @@ endif()
FetchContent_Declare
(
FetchContent_Declare
(
GTest
GTest
GIT_REPOSITORY https://github.com/google/googletest.git
#GIT_REPOSITORY https://github.com/google/googletest.git
GIT_REPOSITORY git@github.com:google/googletest.git
GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571
GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571
)
)
...
...
include/ck/utility/amd_buffer_addressing.hpp
View file @
895e8c40
...
@@ -562,6 +562,33 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
...
@@ -562,6 +562,33 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
dst_wave_addr_offset
);
dst_wave_addr_offset
);
}
}
template
<
typename
T
,
index_t
N
>
__device__
void
amd_global_atomic_add_impl
(
const
typename
vector_type
<
T
,
N
>::
type
src_thread_data
,
T
*
addr
)
{
if
constexpr
(
is_same
<
T
,
half_t
>::
value
)
{
if
constexpr
(
N
==
2
)
{
__builtin_amdgcn_global_atomic_fadd_v2f16
(
addr
,
src_thread_data
);
}
else
if
constexpr
(
N
==
4
)
{
vector_type
<
half_t
,
4
>
tmp
{
src_thread_data
};
static_for
<
0
,
2
,
1
>
{}([
&
](
auto
i
)
{
__builtin_amdgcn_global_atomic_fadd_v2f16
(
addr
+
i
,
tmp
.
AsType
<
half2_t
>
()[
i
]);
});
}
else
if
constexpr
(
N
==
8
)
{
vector_type
<
half_t
,
8
>
tmp
{
src_thread_data
};
static_for
<
0
,
4
,
1
>
{}([
&
](
auto
i
)
{
__builtin_amdgcn_global_atomic_fadd_v2f16
(
addr
+
i
,
tmp
.
AsType
<
half2_t
>
()[
i
]);
});
}
}
}
template
<
typename
T
,
index_t
N
>
template
<
typename
T
,
index_t
N
>
__device__
void
amd_buffer_atomic_add_impl
(
const
typename
vector_type
<
T
,
N
>::
type
src_thread_data
,
__device__
void
amd_buffer_atomic_add_impl
(
const
typename
vector_type
<
T
,
N
>::
type
src_thread_data
,
int32x4_t
dst_wave_buffer_resource
,
int32x4_t
dst_wave_buffer_resource
,
...
@@ -907,7 +934,7 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
...
@@ -907,7 +934,7 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
using
scalar_t
=
typename
scalar_type
<
vector_t
>::
type
;
using
scalar_t
=
typename
scalar_type
<
vector_t
>::
type
;
constexpr
index_t
vector_size
=
scalar_type
<
vector_t
>::
vector_size
;
constexpr
index_t
vector_size
=
scalar_type
<
vector_t
>::
vector_size
;
#if
CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
#if
0
uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
amd_buffer_atomic_add_impl<scalar_t, vector_size>(
amd_buffer_atomic_add_impl<scalar_t, vector_size>(
...
@@ -915,8 +942,11 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
...
@@ -915,8 +942,11 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
#else
#else
if
(
dst_thread_element_valid
)
if
(
dst_thread_element_valid
)
{
{
amd_buffer_atomic_add_impl
<
scalar_t
,
vector_size
>
(
ignore
=
dst_wave_buffer_resource
;
src_thread_data
,
dst_wave_buffer_resource
,
dst_thread_addr_offset
,
0
);
ignore
=
dst_thread_addr_offset
;
//amd_buffer_atomic_add_impl<scalar_t, vector_size>(
//src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
amd_global_atomic_add_impl
<
scalar_t
,
vector_size
>
(
src_thread_data
,
p_dst_wave
+
dst_thread_element_offset
);
}
}
#endif
#endif
}
}
...
...
include/ck/utility/dynamic_buffer.hpp
View file @
895e8c40
...
@@ -358,13 +358,15 @@ struct DynamicBuffer
...
@@ -358,13 +358,15 @@ struct DynamicBuffer
bool
constexpr
use_amd_buffer_addressing
=
bool
constexpr
use_amd_buffer_addressing
=
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
int32_t
>
||
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
int32_t
>
||
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
float
>
||
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
float
>
||
(
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
half_t
>
&&
scalar_per_x_vector
%
2
==
0
);
(
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
half_t
>
&&
scalar_per_x_vector
%
2
==
0
)
||
(
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
bhalf_t
>
&&
scalar_per_x_vector
%
2
==
0
);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
bool
constexpr
use_amd_buffer_addressing
=
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
int32_t
>
;
bool
constexpr
use_amd_buffer_addressing
=
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
int32_t
>
;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool
constexpr
use_amd_buffer_addressing
=
bool
constexpr
use_amd_buffer_addressing
=
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
float
>
||
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
float
>
||
(
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
half_t
>
&&
scalar_per_x_vector
%
2
==
0
);
(
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
half_t
>
&&
scalar_per_x_vector
%
2
==
0
)
||
(
is_same_v
<
remove_cvref_t
<
scalar_t
>
,
bhalf_t
>
&&
scalar_per_x_vector
%
2
==
0
);
#else
#else
bool
constexpr
use_amd_buffer_addressing
=
false
;
bool
constexpr
use_amd_buffer_addressing
=
false
;
#endif
#endif
...
...
script/cmake-ck-dev.sh
View file @
895e8c40
...
@@ -14,7 +14,7 @@ fi
...
@@ -14,7 +14,7 @@ fi
cmake
\
cmake
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_
CXX
_FLAGS
=
"-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker"
\
-D
CMAKE_
HIP
_FLAGS
=
"
--save-temps -v
-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker"
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
BUILD_DEV
=
ON
\
-D
BUILD_DEV
=
ON
\
-D
GPU_TARGETS
=
$GPU_TARGETS
\
-D
GPU_TARGETS
=
$GPU_TARGETS
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment