"...text-generation-inference.git" did not exist on "4f4857a4ac4d09483f72465e5adcd29f38b03b16"
Commit 184c6e7d authored by Chao Liu

nvidia build

parent f00c1381

@@ -325,14 +325,14 @@ struct TensorCoordinate
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
...
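
The hunk above (and the TensorCoordinate_v2 hunk that follows) picks the coordinate type purely by overloading MakeDummyTensorCoordinate on the type of an unnamed descriptor argument, so the choice happens entirely at compile time. A minimal, self-contained sketch of the same tag-dispatch pattern, using hypothetical DescA/DescB/CoordA/CoordB stand-ins rather than the real CK types:

// Sketch only: DescA/DescB/CoordA/CoordB are illustrative stand-ins for
// ConstantTensorDescriptor / ConstantMergedTensorDescriptor and the two
// coordinate types; they are not CK code.
#include <type_traits>

template <int N> struct DescA {};       // plays the role of ConstantTensorDescriptor
template <int N> struct DescB {};       // plays the role of ConstantMergedTensorDescriptor
template <class Desc> struct CoordA {}; // NormalTensorCoordinate analogue
template <class Desc> struct CoordB {}; // MergedTensorCoordinate analogue

// The unnamed value parameter exists only to select an overload at compile time.
template <int N>
constexpr auto MakeDummyCoordinate(DescA<N>) { return CoordA<DescA<N>>{}; }

template <int N>
constexpr auto MakeDummyCoordinate(DescB<N>) { return CoordB<DescB<N>>{}; }

int main()
{
    static_assert(std::is_same<decltype(MakeDummyCoordinate(DescA<2>{})),
                               CoordA<DescA<2>>>::value,
                  "plain descriptor maps to CoordA");
    static_assert(std::is_same<decltype(MakeDummyCoordinate(DescB<3>{})),
                               CoordB<DescB<3>>>::value,
                  "merged descriptor maps to CoordB");
    return 0;
}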

@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

@@ -196,7 +196,7 @@ struct TensorCoordinate_v2
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...

@@ -346,7 +346,7 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

-#if 0
+#if 1
     struct lambda_sequence_logic_or
     {
         template <typename... Seqs>
...

@@ -21,6 +21,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 #endif

+#ifndef CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {

 // This threadwise copy allow vector access of src and dst.

@@ -835,19 +839,15 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             // 2. src_normal_offset must be calculatd at compile time (guaranteed)
             // 3. src_merged_offset can be runtime value (no assumption imposed)
             static_if<SrcMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                vector_data = *reinterpret_cast<const src_vector_t*>(
-                    &p_src[src_normal_offset + src_merged_offset]);
-#elif 0 // inline asm using global_load
-                vector_data = __global_load<TData, SrcDataPerAccess>(
-                    p_src,
-                    static_cast<uint32_t>(src_merged_offset),
-                    static_cast<uint32_t>(src_normal_offset));
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 vector_data = __buffer_load<TData, SrcDataPerAccess>(
                     p_src,
                     static_cast<uint32_t>(src_merged_offset),
                     static_cast<uint32_t>(src_normal_offset));
+#else
+                vector_data = *reinterpret_cast<const src_vector_t*>(
+                    &p_src[src_normal_offset + src_merged_offset]);
 #endif
             }).Else([&](auto) {
                 // src can be all kinds of memory-space.

@@ -940,15 +940,13 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             // 2. dst_normal_offset must be calculatd at compile time (guaranteed)
             // 3. dst_merged_offset can be runtime value (no assumption imposed)
             static_if<DstMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                *reinterpret_cast<dst_vector_t*>(
-                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#elif 0 // inline asm using global_store
-                __global_store<TData, DstDataPerAccess>(
-                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 __buffer_store<TData, DstDataPerAccess>(
                     vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#else
+                *reinterpret_cast<dst_vector_t*>(
+                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
 #endif
             }).Else([&](auto) {
                 // dst can be all kinds of memory-space

@@ -1053,15 +1051,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
         auto src_slice_vectorized =
             mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mSrcSlice: ", typename decltype(mSrcSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("src_slice_vector: ", typename decltype(src_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-#if 1 // debug
         ford<decltype(src_slice_vectorized.GetLengths()), SrcDimAccessOrder>{}(
             [&](auto src_vector_id) {
                 // load vector from src

@@ -1080,7 +1069,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
                         reinterpret_cast<const data_type*>(&vector_data)[i];
                 }
             });
-#endif
     }

     // copy data from buffer into dst

@@ -1093,15 +1081,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
         auto dst_slice_vectorized =
             mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mDstSlice: ", typename decltype(mDstSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("dst_slice_vector: ", typename decltype(dst_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-#if 1 // debug
         ford<decltype(dst_slice_vectorized.GetLengths()), DstDimAccessOrder>{}(
             [&](auto dst_vector_id) {

@@ -1122,7 +1101,6 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
                 // write vector into dst
                 dst_slice_vectorized(dst_vector_id) = vector_data;
             });
-#endif
     }
 }

@@ -1330,13 +1308,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const index_t buffer_offset = i * src_data_per_access;

                 static_if<SrcMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
                         __buffer_load<TData, SrcDataPerAccess>(
                             p_src, static_cast<uint32_t>(src_offset), static_cast<uint32_t>(0));
+#else
+                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
 #endif
                 }).Else([&](auto) {
                     // src can be all kinds of memory-space.

@@ -1358,15 +1337,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();

                 static_if<DstMemorySpace == 2>{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
-                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     __buffer_store<TData, DstDataPerAccess>(
                         *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
                         p_dst,
                         dst_offset,
                         0);
+#else
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
 #endif
                 }).Else([&](auto) {
                     // dst can be all kinds of memory-space
...
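
The hunks above replace hand-edited #if 0 / #elif 1 switches with a single compile-time gate built from the new config macros, keeping the plain pointer path as the fallback whenever the AMD buffer intrinsics are disabled (as in the NVIDIA config further down). A minimal, host-compilable sketch of that gating pattern, assuming a fake_buffer_load stand-in for CK's __buffer_load:

// Sketch only: CK_USE_AMD_INTRINSIC comes from the config headers in this
// commit; fake_buffer_load and load_vector are illustrative names, not CK code.
#include <cstdint>
#include <cstdio>
#include <cstring>

#ifndef CK_USE_AMD_INTRINSIC
#define CK_USE_AMD_INTRINSIC 0 // portable default, matching the NVIDIA config
#endif

struct Float4 { float v[4]; }; // stand-in for vector_type<float, 4>::MemoryType

// Stand-in for __buffer_load<float, 4>; on AMD the real helper wraps
// llvm.amdgcn.buffer.load and keeps the two offsets separate.
Float4 fake_buffer_load(const float* p, std::uint32_t thread_off, std::uint32_t const_off)
{
    Float4 out;
    std::memcpy(out.v, p + thread_off + const_off, sizeof(out.v));
    return out;
}

Float4 load_vector(const float* p_src, std::uint32_t merged_off, std::uint32_t normal_off)
{
#if CK_USE_AMD_INTRINSIC
    // intrinsic path: mirror __buffer_load's (pointer, thread offset, constant offset) signature
    return fake_buffer_load(p_src, merged_off, normal_off);
#else
    // fallback path: fold both offsets into ordinary pointer arithmetic, as in the #else branches above
    Float4 out;
    std::memcpy(out.v, &p_src[normal_off + merged_off], sizeof(out.v));
    return out;
#endif
}

int main()
{
    float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    Float4 x = load_vector(data, 2, 2); // reads elements 4..7
    std::printf("%g %g %g %g\n", x.v[0], x.v[1], x.v[2], x.v[3]);
    return 0;
}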

@@ -8,507 +8,6 @@ namespace ck {
// cast a pointer of LDS to its address
extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load");
__device__ vector_type<float, 2>::MemoryType
__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
__device__ vector_type<float, 4>::MemoryType
__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
__device__ void __llvm_amdgcn_buffer_store(float vdata,
int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store");
__device__ void __llvm_amdgcn_buffer_storex2(vector_type<float, 2>::MemoryType vdata,
int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
__device__ void __llvm_amdgcn_buffer_storex4(vector_type<float, 4>::MemoryType vdata,
int32x4_t rsrc,
uint32_t vindex,
uint32_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
// global_load and global_store
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType __global_load(
const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
template <typename T, index_t VectorSize>
__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
T* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset);
template <>
__device__ float __global_load<float, 1>(const float* p_src_block,
uint32_t src_thread_data_offset,
uint32_t src_const_data_offset)
{
float dst;
#if 0 // source code
dst = p_src_block[src_const_data_offset + src_thread_data_offset];
#elif 0 // use VGPR only
const float* src_thread_addr_offset_u64 =
p_src_block + src_const_data_offset + src_thread_data_offset;
asm volatile("\n \
global_load_dword %0, %1 off offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64));
#elif 0 // use VGPR and SGPR, do compute on VALU
uint64_t src_thread_addr_offset_u64 =
(src_thread_data_offset + src_const_data_offset) * sizeof(float);
asm volatile("\n \
global_load_dword %0, %1, %2, offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64), "s"(p_src_block));
#elif 1 // use VGPR and SGPR, do compute on SALU
uint64_t src_thread_addr_offset_u64 =
static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
asm volatile("\n \
global_load_dword %0, %1, %2, offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
#endif
return dst;
}
template <>
__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
using vector_t = vector_type<float, 2>::MemoryType;
vector_t dst;
#if 0 // source code
dst = *reinterpret_cast<const vector_t*>(&p_src_block[src_const_data_offset + src_thread_data_offset]);
#elif 0 // use VGPR only
const float* src_thread_addr_offset_u64 =
p_src_block + src_const_data_offset + src_thread_data_offset;
asm volatile("\n \
global_load_dwordx2 %0, %1 off offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64));
#elif 0 // use VGPR and SGPR, do compute on VALU
uint64_t src_thread_addr_offset_u64 =
(src_thread_data_offset + src_const_data_offset) * sizeof(float);
asm volatile("\n \
global_load_dwordx2 %0, %1, %2, offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64), "s"(p_src_block));
#elif 1 // use VGPR and SGPR, do compute on SALU
uint64_t src_thread_addr_offset_u64 =
static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
asm volatile("\n \
global_load_dwordx2 %0, %1, %2, offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
#endif
return dst;
}
template <>
__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
using vector_t = vector_type<float, 4>::MemoryType;
vector_t dst;
#if 0 // source code
dst = *reinterpret_cast<const vector_t*>(&p_src_block[src_const_data_offset + src_thread_data_offset]);
#elif 0 // use VGPR only
const float* src_thread_addr_offset_u64 =
p_src_block + src_const_data_offset + src_thread_data_offset;
asm volatile("\n \
global_load_dwordx4 %0, %1 off offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64));
#elif 0 // use VGPR and SGPR, do compute on VALU
uint64_t src_thread_addr_offset_u64 =
(src_thread_data_offset + src_const_data_offset) * sizeof(float);
asm volatile("\n \
global_load_dwordx4 %0, %1, %2, offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64), "s"(p_src_block));
#elif 1 // use VGPR and SGPR, do compute on SALU
uint64_t src_thread_addr_offset_u64 =
static_cast<uint64_t>(src_thread_data_offset * sizeof(float));
const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
asm volatile("\n \
global_load_dwordx4 %0, %1, %2, offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
#endif
return dst;
}
template <>
__device__ void __global_store<float, 1>(const float& src,
float* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset)
{
#if 0 // compute on VALU
uint64_t dst_thread_data_offset_u64 = (dst_thread_data_offset + dst_const_data_offset) * sizeof(float);
asm volatile("\n \
global_store_dword %0, %1, %2, offset:0 \n \
"
:
: "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block));
#else // compute on SALU
uint64_t dst_thread_data_offset_u64 = dst_thread_data_offset * sizeof(float);
float* p_dst_block_with_offset = p_dst_block + dst_const_data_offset;
asm volatile("\n \
global_store_dword %0, %1, %2, offset:0 \n \
"
:
: "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
#endif
}
// buffer_load and buffer_store
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType __buffer_load(
const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
template <typename T, index_t VectorSize>
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
T* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset);
template <>
__device__ float __buffer_load<float, 1>(const float* p_src_block,
uint32_t src_thread_data_offset,
uint32_t src_const_data_offset)
{
#if 0
float dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
// fill in byte 2
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
#else
float dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
// fill in byte 2
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
dst = __llvm_amdgcn_buffer_load(
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return dst;
#endif
}
template <>
__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
#if 0
vector_type<float, 2>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
// fill in byte 2
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
#else
vector_type<float, 2>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
// fill in byte 2
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
dst = __llvm_amdgcn_buffer_loadx2(
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return dst;
#endif
}
template <>
__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
{
#if 0
vector_type<float, 4>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
// fill in byte 2
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
return dst;
#elif 1
vector_type<float, 4>::MemoryType dst;
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
int32x4_t src_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
// fill in byte 2
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
dst = __llvm_amdgcn_buffer_loadx4(
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return dst;
#endif
}
template <>
__device__ void __buffer_store<float, 1>(const float& src,
float* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
__llvm_amdgcn_buffer_store(
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}
template <>
__device__ void __buffer_store<float, 2>(const vector_type<float, 2>::MemoryType& src,
float* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
__llvm_amdgcn_buffer_storex2(
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}
template <>
__device__ void __buffer_store<float, 4>(const vector_type<float, 4>::MemoryType& src,
float* p_dst_block,
uint32_t dst_thread_data_offset,
uint32_t dst_const_data_offset)
{
#if 0
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_setting),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#else
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
int32x4_t dst_block_setting{0};
// fill in byte 0 - 1
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
// fill in byte 2
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
// fill in byte 3
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
__llvm_amdgcn_buffer_storex4(
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

__device__ void vmcnt(index_t cnt)
{
    if(cnt == 0)
...
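
Every deleted __buffer_load / __buffer_store helper above builds the same 128-bit buffer resource descriptor by hand: the buffer base pointer is written into the first two dwords, dword 2 is set to -1, and dword 3 to the constant 0x00027000, after which the descriptor is passed as the rsrc argument of llvm.amdgcn.buffer.load / llvm.amdgcn.buffer.store. A host-side sketch of just that descriptor construction; make_buffer_resource is an illustrative name rather than a CK function, and any field meaning beyond what the deleted comments state is an assumption:

// Sketch only: mirrors the int32x4_t "block setting" filled in by the deleted code.
#include <cstdint>
#include <cstdio>
#include <cstring>

struct BufferResource { std::int32_t words[4]; }; // corresponds to int32x4_t above

BufferResource make_buffer_resource(const float* base)
{
    BufferResource res{};
    // dwords 0-1: 64-bit base address (the deleted code stored the pointer into
    // the first two dwords via a reinterpret_cast)
    std::memcpy(&res.words[0], &base, sizeof(base));
    // dword 2: -1 as in the deleted code (commonly documented as the num_records field)
    res.words[2] = -1;
    // dword 3: the configuration constant 0x00027000 used by the deleted code
    res.words[3] = 0x00027000;
    return res;
}

int main()
{
    float buf[16] = {};
    BufferResource r = make_buffer_resource(buf);
    // On the GPU this value would be the rsrc operand of the buffer intrinsics.
    std::printf("0x%08x 0x%08x 0x%08x 0x%08x\n",
                static_cast<unsigned>(r.words[0]), static_cast<unsigned>(r.words[1]),
                static_cast<unsigned>(r.words[2]), static_cast<unsigned>(r.words[3]));
    return 0;
}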

@@ -22,4 +22,8 @@
 #include "amd_inline_asm.hpp"
 #endif

+#if CK_USE_AMD_INTRINSIC
#include "amd_intrinsic.hpp"
#endif
#endif #endif

@@ -4,9 +4,11 @@
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"

+#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1

 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0

@@ -14,7 +16,7 @@
 namespace ck {

-#if CK_USE_UNSIGNED_INDEX_TYPE
+#if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
...

@@ -6,9 +6,11 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"

+#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_NVIDIA 1
-#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 0
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0

@@ -16,7 +18,7 @@
 namespace ck {

-#if CK_USE_UNSIGNED_INDEX_TYPE
+#if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
...
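
Both configuration headers rename CK_USE_UNSIGNED_INDEX_TYPE to CK_UNSIGNED_INDEX_TYPE and keep gating ck::index_t on it, with both backends defaulting to the signed 32-bit type. A small self-contained sketch of that selection; the main() check is added purely for illustration:

// Sketch only: the macro, the ck namespace, and index_t come from the hunks above.
#include <cstdint>
#include <type_traits>

#ifndef CK_UNSIGNED_INDEX_TYPE
#define CK_UNSIGNED_INDEX_TYPE 0 // both configs in this commit set 0
#endif

namespace ck {

#if CK_UNSIGNED_INDEX_TYPE
using index_t = std::uint32_t;
#else
using index_t = std::int32_t;
#endif

} // namespace ck

int main()
{
    // Whichever branch is taken, index_t stays a 32-bit integer type.
    static_assert(std::is_same<ck::index_t, std::int32_t>::value ||
                      std::is_same<ck::index_t, std::uint32_t>::value,
                  "index_t is a 32-bit integer");
    return 0;
}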