Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
435f5f91
"...composable_kernel_rocm.git" did not exist on "3e6c2610ae9256dc7e4118dbf2074e97487babe3"
Commit
435f5f91
authored
Jun 28, 2020
by
Chao Liu
Browse files
buffer APIs use combined wave and thread offset
parent
7a3d9697
Changes
2
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
351 additions
and
461 deletions
+351
-461
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+342
-451
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
...ble_kernel/include/utility/in_memory_operation.amd.hpp.in
+9
-10
No files found.
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
435f5f91
This diff is collapsed.
Click to expand it.
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
View file @
435f5f91
...
@@ -72,8 +72,8 @@ struct SetData
...
@@ -72,8 +72,8 @@ struct SetData
#if CK_USE_AMD_BUFFER_ADDRESSING
#if CK_USE_AMD_BUFFER_ADDRESSING
// buffer_load requires:
// buffer_load requires:
// 1) p_src must be in global memory space, d_dst must be vgpr
// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
// 2) p_src to be a block-invariant pointer.
// 2) p_src_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
// It is user's responsibility to make sure that is true.
template <>
template <>
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
...
@@ -88,13 +88,13 @@ struct SetData
...
@@ -88,13 +88,13 @@ struct SetData
if(dst_valid)
if(dst_valid)
{
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid, src_range);
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
}
}
}
}
// buffer_store requires:
// buffer_store requires:
// 1) p_src must be in vgpr space, d_dst must be global memory
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst to be a block-invariant pointer.
// 2) p_dst_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
// It is user's responsibility to make sure that is true.
template <>
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
...
@@ -109,7 +109,7 @@ struct SetData
...
@@ -109,7 +109,7 @@ struct SetData
const auto zeros = vector_t(0);
const auto zeros = vector_t(0);
amd_buffer_store<T, DataPerAccess>(
amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid, dst_range);
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, dst_valid, dst_range);
}
}
#endif
#endif
};
};
...
@@ -138,9 +138,9 @@ struct AtomicAddData
...
@@ -138,9 +138,9 @@ struct AtomicAddData
}
}
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
// buffer_atomic_add requires:
// buffer_atomic requires:
// 1) p_src must be in vgpr space, d_dst must be global memory
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst to be a block-invariant pointer.
// 2) p_dst_thread to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
// It is user's responsibility to make sure that is true.
template <>
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
...
@@ -156,7 +156,6 @@ struct AtomicAddData
...
@@ -156,7 +156,6 @@ struct AtomicAddData
amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
p_dst,
p_dst,
dst_offset,
dst_offset,
0,
dst_valid,
dst_valid,
index_t dst_range);
index_t dst_range);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment