Commit 7abc0752 authored by Jing Zhang

add thread_buff

parent 82bf5de2
#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_V2_HPP
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_V2_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
#include "cluster_descriptor.hpp"
#include "threadwise_generic_tensor_slice_copy_v2.hpp"
namespace ck {
// This blockwise copy allows vector access on both src and dst.
// The vector size may differ between src and dst.
// The dimension of vector access may differ between src and dst.
// The dimension access order may differ between src and dst.
// Valid-mapping check on src data: read 0 if the src data has an invalid mapping.
// Valid-mapping check on dst data: no write if the dst data has an invalid mapping.
// BlockSize may be equal to or larger than the ThreadCluster size, i.e. some threads
// may not participate in the threadwise copy.
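// For illustration only (hypothetical numbers): with BlockSliceLengths = {8, 128},
// ThreadClusterLengths = {8, 32} and ThreadSliceLengths = {1, 4}, the static_asserts in the
// constructor require {1, 4} * {8, 32} == {8, 128} and BlockSize >= 8 * 32 = 256. Thread t
// is mapped by mThreadClusterDesc to a cluster index (c0, c1), and its slice origin inside
// the block window is thread_cluster_id * ThreadSliceLengths = (c0 * 1, c1 * 4).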
template <index_t BlockSize,
typename BlockSrcDesc,
typename BlockDstDesc,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorReadDim,
index_t DstVectorWriteDim,
index_t SrcDataPerRead,
index_t DstDataPerWrite,
AddressSpace SrcAddressSpace = AddressSpace::Generic,
AddressSpace ThreadBufferAddressSpace = AddressSpace::Generic,
AddressSpace DstAddressSpace = AddressSpace::Generic,
InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
struct BlockwiseGenericTensorSliceCopy_v5
{
static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseGenericTensorSliceCopy_v5(const Index& src_block_slice_origin,
const Index& dst_block_slice_origin)
{
static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
nDim == BlockDstDesc::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() &&
nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= mThreadClusterDesc.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
const auto thread_cluster_id =
mThreadClusterDesc.CalculateClusterIndex(get_thread_local_1d_id());
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
mThreadwiseCopy.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseCopy.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
}
__device__ static constexpr index_t GetThreadBufferSize()
{
return ThreadBufferDesc::GetElementSpace();
}
template <typename BlockSrcData, typename ThreadBufferData>
__device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
ThreadBufferData* p_thread_buffer)
{
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
mThreadwiseCopy.Load(p_block_src, p_thread_buffer);
}
}
template <typename ThreadBufferData, typename BlockDstData>
__device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
BlockDstData* p_block_dst)
{
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
mThreadwiseCopy.Store(p_thread_buffer, p_block_dst);
}
}
template <typename BlockSrcData, typename BlockDstData>
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst)
{
static_assert(ThreadBufferAddressSpace == AddressSpace::Vgpr,
"wrong! Run() uses a VGPR array as its thread buffer, but "
"ThreadBufferAddressSpace is not Vgpr, so RunLoadThreadBuffer and "
"RunStoreThreadBuffer would behave differently");
BlockSrcData p_thread_buffer[GetThreadBufferSize()];
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
RunLoadThreadBuffer(p_block_src, p_thread_buffer);
// if there is type conversion, it's done during store
RunStoreThreadBuffer(p_thread_buffer, p_block_dst);
}
}
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
mThreadwiseCopy.MoveSrcSliceWindow(step_sizes, positive_direction);
}
}
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
if(BlockSize == mThreadClusterDesc.GetElementSize() or
get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
{
mThreadwiseCopy.MoveDstSliceWindow(step_sizes, positive_direction);
}
}
private:
using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(ThreadSliceLengths{}));
using ThreadwiseCopy = ThreadwiseGenericTensorSliceCopy_v5<BlockSrcDesc,
BlockDstDesc,
ThreadSliceLengths,
SrcDimAccessOrder,
SrcVectorReadDim,
SrcDataPerRead,
DstDataPerWrite,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcDataStride,
DstDataStride>;
static constexpr auto mThreadClusterDesc =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
ThreadwiseCopy mThreadwiseCopy;
};
} // namespace ck
#endif
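For orientation, the following is a minimal usage sketch of the new blockwise copy, mirroring the swap from _v4 to _v5 in the gridwise GEMM below. The shapes, descriptor types and pointer names are hypothetical, and the two address-space arguments are only there because Run() asserts a VGPR thread buffer; the actual kernel supplies its own descriptors and lengths.

    // inside a kernel: copy a hypothetical 8 x 128 window with 256 threads (float data)
    using SrcDesc = decltype(make_native_tensor_descriptor_packed(Sequence<8, 128>{}));
    using DstDesc = decltype(make_native_tensor_descriptor_packed(Sequence<8, 128>{}));

    auto blockwise_copy =
        BlockwiseGenericTensorSliceCopy_v5<256,                   // BlockSize
                                           SrcDesc,
                                           DstDesc,
                                           Sequence<8, 128>,      // BlockSliceLengths
                                           Sequence<1, 4>,        // ThreadSliceLengths
                                           Sequence<8, 32>,       // ThreadClusterLengths
                                           Sequence<0, 1>,        // ThreadClusterArrangeOrder
                                           Sequence<0, 1>,        // SrcDimAccessOrder
                                           Sequence<0, 1>,        // DstDimAccessOrder
                                           1,                     // SrcVectorReadDim
                                           1,                     // DstVectorWriteDim
                                           1,                     // SrcDataPerRead
                                           1,                     // DstDataPerWrite
                                           AddressSpace::Generic, // SrcAddressSpace
                                           AddressSpace::Vgpr>    // ThreadBufferAddressSpace
            (make_zero_multi_index<2>(), make_zero_multi_index<2>());

    // each thread loads its 1 x 4 piece and stores it, staging through a thread buffer
    blockwise_copy.Run(p_src, p_dst);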
@@ -6,6 +6,7 @@
#include "tensor_descriptor_helper.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_generic_tensor_slice_copy_v2.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
@@ -152,7 +153,7 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
// B matrix blockwise copy
auto b_blockwise_copy =
-BlockwiseGenericTensorSliceCopy_v4<BlockSize,
+BlockwiseGenericTensorSliceCopy_v5<BlockSize,
decltype(b_k_n_global_desc),
decltype(b_k_n_block_desc),
decltype(b_k_n_block_desc.GetLengths()),
......
#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2_HPP
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate.hpp"
namespace ck {
// This threadwise copy allows vector access on both src and dst.
// The vector size may differ between src and dst.
// The dimension of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// Valid-mapping check on src data: read 0 if the src data has an invalid mapping.
// Valid-mapping check on dst data: no write if the dst data has an invalid mapping.
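// For example (hypothetical scenario): when SrcDesc embeds padding, coordinates that fall
// in the pad region have an invalid mapping, so Load() leaves the corresponding entries of
// its long-vector buffer at 0 instead of reading them; likewise, writes to dst coordinates
// with an invalid mapping are skipped.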
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDstDimAccessOrder,
index_t SrcDstVectorReadWriteDim,
index_t SrcDataPerRead,
index_t DstDataPerWrite,
AddressSpace SrcAddressSpace = AddressSpace::Generic,
AddressSpace DstAddressSpace = AddressSpace::Generic,
InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
struct ThreadwiseGenericTensorSliceCopy_v5
{
using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(SliceLengths{}));
static constexpr index_t ThreadBufferSize = ThreadBufferDesc::GetElementSpace();
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = typename TensorCoordinate<SrcDesc>::type;
using DstCoord = typename TensorCoordinate<DstDesc>::type;
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v5(const Index& src_slice_origin,
const Index& dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
nDim == SrcDstDimAccessOrder::Size(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDstDimAccessOrder>{}, "wrong! map is not valid");
static_assert(SliceLengths{}[SrcDstVectorReadWriteDim] %
math::lcm(SrcDataPerRead, DstDataPerWrite) ==
0,
"wrong! cannot evenly divide");
// temporary restriction while thread_buff below is a hard-coded float array
static_assert(ThreadBufferSize == 4,
"wrong! thread_buff currently assumes a 4-element slice");
// TODO: sanity-check whether vectorized memory read/write is allowed on src and dst
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v5()
: ThreadwiseGenericTensorSliceCopy_v5(make_zero_multi_index<nDim>(),
make_zero_multi_index<nDim>())
{
}
__device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(DstCoord dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <typename SrcData, typename DstData>
__device__ void Load(const SrcData* p_src, DstData* p_dst)
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
static_assert(SrcDataPerRead == 1 && DstDataPerWrite == 1,
"wrong! only scalar (non-vectorized) access is supported for now");
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
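// For illustration (hypothetical values): SrcDataPerRead = 4 and DstDataPerWrite = 2
// would give long_vector_size = lcm(4, 2) = 4, so the vector dimension is traversed
// in chunks of 4 elements: each chunk is filled by long_vector_size / SrcDataPerRead
// = 1 read and drained by long_vector_size / DstDataPerWrite = 2 writes. With the
// temporary static_assert above, both are 1 and each chunk holds a single element.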
ford<decltype(long_vector_access_lengths), SrcDstDimAccessOrder>{}(
[&](auto long_vector_access_id) {
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];
#if 1
// zero out buffer
static_for<0, long_vector_size, 1>{}([&](auto i) { p_src_long_vector[i] = 0; });
#endif
// load data from src to the long-vector buffer
static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
const auto src_coord =
mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
// Valid-mapping check on src data: only the first element of this src
// vector is checked. It is the user's responsibility to ensure that all
// elements of the src vector share the same valid/invalid mapping.
transfer_data<SrcData,
SrcDataPerRead,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
SrcDataStride,
1>(p_src,
src_coord.GetOffset(),
src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
SrcDesc::GetElementSpace(),
p_src_long_vector,
buffer_offset,
true,
long_vector_size);
});
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
static_for<0, long_vector_size, 1>{}([&](auto i) {
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
});
// store data from the long-vector buffer to dst
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
const auto dst_coord =
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
// stage this element into the member thread_buff (packed layout) instead
// of writing it to dst
const auto buff_off =
ThreadBufferDesc::CalculateOffset(long_vector_data_begin_id + scalar_id);
thread_buff[buff_off] = p_dst_long_vector[buffer_offset];
// Valid-mapping check on dst data: only the first element of this dst
// vector is checked. It is the user's responsibility to ensure that all
// elements of the dst vector share the same valid/invalid mapping.
//transfer_data<DstData,
//DstDataPerWrite,
//AddressSpace::Vgpr,
//DstAddressSpace,
//DstInMemOp,
//1,
//DstDataStride>(p_dst_long_vector,
//buffer_offset,
//true,
//long_vector_size,
//thread_buff,
//dst_coord.GetOffset(),
//dst_coord.IsOffsetValidAssumingUpperIndexIsValid(),
//DstDesc::GetElementSpace());
});
});
}
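// Store() is the counterpart of Load(): it reads the slice back from the member
// thread_buff and writes it to dst, applying the SrcData -> DstData conversion and
// the DstInMemOp operation on the way out.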
template <typename SrcData, typename DstData>
__device__ void Store(const SrcData* p_src, DstData* p_dst)
{
constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
static_assert(SrcDataPerRead == 1 && DstDataPerWrite == 1,
"wrong! only scalar (non-vectorized) access is supported for now");
constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
ford<decltype(long_vector_access_lengths), SrcDstDimAccessOrder>{}(
[&](auto long_vector_access_id) {
// data id w.r.t slicing-window
auto long_vector_data_begin_id = long_vector_access_id;
long_vector_data_begin_id(vector_access_dim) =
long_vector_size * long_vector_access_id[vector_access_dim];
// buffer to hold a src long-vector
SrcData p_src_long_vector[long_vector_size];
#if 1
// zero out buffer
static_for<0, long_vector_size, 1>{}([&](auto i) { p_src_long_vector[i] = 0; });
#endif
// load data from src to the long-vector buffer
static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
// read this element back from the member thread_buff (packed layout)
// instead of reading it from p_src
const auto buff_off =
ThreadBufferDesc::CalculateOffset(long_vector_data_begin_id + scalar_id);
p_src_long_vector[buffer_offset] = thread_buff[buff_off];
//const auto src_coord =
//mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
// Valid-mapping check on src data: only the first element of this src
// vector is checked. It is the user's responsibility to ensure that all
// elements of the src vector share the same valid/invalid mapping.
//transfer_data<SrcData,
//SrcDataPerRead,
//SrcAddressSpace,
//AddressSpace::Vgpr,
//InMemoryDataOperation::Set,
//SrcDataStride,
//1>(thread_buff,
//src_coord.GetOffset(),
//src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
//SrcDesc::GetElementSpace(),
//p_src_long_vector,
//buffer_offset,
//true,
//long_vector_size);
});
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
static_for<0, long_vector_size, 1>{}([&](auto i) {
p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
});
// store data from the long-vector buffer to dst
static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
const auto dst_coord =
mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);
// Valid-mapping check on dst data: only the first element of this dst
// vector is checked. It is the user's responsibility to ensure that all
// elements of the dst vector share the same valid/invalid mapping.
transfer_data<DstData,
DstDataPerWrite,
AddressSpace::Vgpr,
DstAddressSpace,
DstInMemOp,
1,
DstDataStride>(p_dst_long_vector,
buffer_offset,
true,
long_vector_size,
p_dst,
dst_coord.GetOffset(),
dst_coord.IsOffsetValidAssumingUpperIndexIsValid(),
DstDesc::GetElementSpace());
});
});
}
template <typename T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_multi_index(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
.Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <typename T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_multi_index(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
.Else([&](auto) { mDstSliceOrigin -= step_sizes; });
}
// per-thread staging buffer (WIP: hard-coded element type and size; see the
// ThreadBufferSize static_assert in the constructor)
float thread_buff[8];
private:
SrcCoord mSrcSliceOrigin;
DstCoord mDstSliceOrigin;
};
} // namespace ck
#endif
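To make the role of the new thread_buff member concrete, here is a hedged sketch of driving this threadwise copy directly from device code. The descriptors, the 1 x 4 slice shape and the pointers are hypothetical (chosen so the temporary ThreadBufferSize == 4 assert holds). In this revision, Load() does not use its destination pointer and Store() does not use its source pointer; the data is held in the member thread_buff between the two calls.

    // hypothetical 1 x 4 slice of float data, scalar (non-vectorized) access
    using SrcDesc = decltype(make_native_tensor_descriptor_packed(Sequence<1, 4>{}));
    using DstDesc = decltype(make_native_tensor_descriptor_packed(Sequence<1, 4>{}));

    ThreadwiseGenericTensorSliceCopy_v5<SrcDesc,
                                        DstDesc,
                                        Sequence<1, 4>, // SliceLengths
                                        Sequence<0, 1>, // SrcDstDimAccessOrder
                                        1,              // SrcDstVectorReadWriteDim
                                        1,              // SrcDataPerRead
                                        1>              // DstDataPerWrite
        threadwise_copy(make_zero_multi_index<2>(), make_zero_multi_index<2>());

    threadwise_copy.Load(p_src, p_dst);  // reads the slice from p_src into thread_buff
    threadwise_copy.Store(p_src, p_dst); // drains thread_buff into p_dst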
@@ -3,14 +3,14 @@ rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
-MY_PROJECT_SOURCE=../../../
+MY_PROJECT_SOURCE=../
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -gline-tables-only -save-temps=$CWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......