Unverified commit 562e1e27, authored by Chao Liu, committed by GitHub

MIOpen integration: recent bug fixes from MIOpen (#5)

parent 52c3fe05
@@ -114,7 +114,7 @@ struct ThreadwiseGemmTransANormalBNormalC
                const index_t cindex_0 = MatrixC::CalculateOffset(m, 0);
                const index_t cindex_1 = MatrixC::CalculateOffset(m, 1);
-               __outer_product_1x2(
+               amd_assembly_outer_product_1x2(
                    p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]);
            });
@@ -129,15 +129,15 @@ struct ThreadwiseGemmTransANormalBNormalC
                const index_t cindex_2 = MatrixC::CalculateOffset(m, 2);
                const index_t cindex_3 = MatrixC::CalculateOffset(m, 3);
-               __outer_product_1x4(p_a[aindex],
+               amd_assembly_outer_product_1x4(p_a[aindex],
                                    p_b[bindex_0],
                                    p_b[bindex_1],
                                    p_b[bindex_2],
                                    p_b[bindex_3],
                                    p_c[cindex_0],
                                    p_c[cindex_1],
                                    p_c[cindex_2],
                                    p_c[cindex_3]);
            });
        }
    }
...
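For reference, the helpers renamed above perform a rank-1 (outer-product) update of one row of C, matching the comment c[i,j] += inner_product(a[i], b[j]). A minimal scalar sketch of what the 1x4 variant computes (illustrative only, not part of the patch; the committed helpers are the inline-assembly routines further down in this diff):

// Scalar reference for the fp32 1x4 outer-product step: c[j] += a * b[j], j = 0..3.
__device__ void outer_product_1x4_reference(
    float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
{
    c0 += a * b0;
    c1 += a * b1;
    c2 += a * b2;
    c3 += a * b3;
}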
@@ -123,7 +123,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
                    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                       __buffer_load<SrcData, SrcDataPerAccess>(
+                       amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                            fwd(p_src), src_coord.GetOffset(), 0);
#else
                    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
@@ -162,7 +162,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                {
                    static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
-                       __buffer_store<DstData, DstDataPerAccess>(
+                       amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                            fwd(p_dst),
                            dst_coord.GetOffset(),
@@ -311,7 +311,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
                    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                       __buffer_load<SrcData, SrcDataPerAccess>(
+                       amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                            p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else
                    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
@@ -503,7 +503,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                {
                    static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
-                       __buffer_store<DstData, DstDataPerAccess>(
+                       amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                            p_dst,
                            dst_nonlinear_coord.GetOffset(),
...
@@ -335,7 +335,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
                // 3. src_merged_offset can be runtime value (no assumption imposed)
                static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
-                   vector_data = __buffer_load<SrcData, SrcDataPerAccess>(
+                   vector_data = amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                        fwd(p_src), src_merged_offset, src_normal_offset);
#else
                    vector_data = *reinterpret_cast<const src_vector_t*>(
@@ -375,7 +375,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
        // copy data from buffer into dst
        {
-           using dst_vector_t = typename vector_type<SrcData, DstDataPerAccess>::MemoryType;
+           using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
@@ -420,7 +420,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
                    const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                        dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
-                   reinterpret_cast<SrcData*>(&vector_data)[i] = p_dst_buffer[buffer_offset];
+                   reinterpret_cast<DstData*>(&vector_data)[i] = p_dst_buffer[buffer_offset];
                }
                // offset w.r.t. normal dimension is known at compile-time
@@ -444,7 +444,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
                // 3. dst_merged_offset can be runtime value (no assumption imposed)
                static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
#if CK_USE_AMD_BUFFER_ADDRESSING
-                   __buffer_store<SrcData, DstDataPerAccess>(
+                   amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                        vector_data, fwd(p_dst), dst_merged_offset, dst_normal_offset);
#else
                    *reinterpret_cast<dst_vector_t*>(
...
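The SrcData to DstData changes above matter whenever the source and destination element types differ: vector_type<T, N>::MemoryType is N elements of T, so building dst_vector_t (or the per-element cast) from SrcData would use the wrong byte width per access. A hedged illustration of the size mismatch, assuming the vector_type definitions added later in this commit are in scope:

// With DstDataPerAccess = 4, the two instantiations differ in size, so mixing them up
// makes every access read or write the wrong number of bytes.
static_assert(sizeof(typename ck::vector_type<float, 4>::MemoryType) == 16,
              "4 x fp32 occupies 16 bytes");
static_assert(sizeof(typename ck::vector_type<_Float16, 4>::MemoryType) == 8,
              "4 x fp16 occupies 8 bytes");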
@@ -19,55 +19,56 @@ __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
                                           index_t vindex,
                                           index_t offset,
                                           bool glc,
-                                          bool slc) __asm("llvm.amdgcn.buffer.load");
+                                          bool slc) __asm("llvm.amdgcn.buffer.load.f32");
__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
                                                index_t vindex,
                                                index_t offset,
                                                bool glc,
-                                               bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
+                                               bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
                                                index_t vindex,
                                                index_t offset,
                                                bool glc,
-                                               bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
+                                               bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
__device__ void __llvm_amdgcn_buffer_store(float vdata,
                                           int32x4_t rsrc,
                                           index_t vindex,
                                           index_t offset,
                                           bool glc,
-                                          bool slc) __asm("llvm.amdgcn.buffer.store");
+                                          bool slc) __asm("llvm.amdgcn.buffer.store.f32");
__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata,
                                             int32x4_t rsrc,
                                             index_t vindex,
                                             index_t offset,
                                             bool glc,
-                                            bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
+                                            bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");
__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
                                             int32x4_t rsrc,
                                             index_t vindex,
                                             index_t offset,
                                             bool glc,
-                                            bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
+                                            bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
template <typename T, index_t VectorSize>
-__device__ typename vector_type<T, VectorSize>::MemoryType
-__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
+__device__ typename vector_type<T, VectorSize>::MemoryType amd_intrinsic_buffer_load(
+    const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
template <typename T, index_t VectorSize>
-__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
-                               T* p_dst_block,
-                               index_t dst_thread_data_offset,
-                               index_t dst_const_data_offset);
+__device__ void
+amd_intrinsic_buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                           T* p_dst_block,
+                           index_t dst_thread_data_offset,
+                           index_t dst_const_data_offset);
template <>
-__device__ float __buffer_load<float, 1>(const float* p_src_block,
-                                         index_t src_thread_data_offset,
-                                         index_t src_const_data_offset)
+__device__ float amd_intrinsic_buffer_load<float, 1>(const float* p_src_block,
+                                                     index_t src_thread_data_offset,
+                                                     index_t src_const_data_offset)
{
    float dst;
@@ -100,9 +101,9 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
}
template <>
-__device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
-                                            index_t src_thread_data_offset,
-                                            index_t src_const_data_offset)
+__device__ float2_t amd_intrinsic_buffer_load<float, 2>(const float* p_src_block,
+                                                        index_t src_thread_data_offset,
+                                                        index_t src_const_data_offset)
{
    float2_t dst;
@@ -135,9 +136,9 @@ __device__ float2_t __buffer_load<float, 2>(const float* p_src_block,
}
template <>
-__device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
-                                            index_t src_thread_data_offset,
-                                            index_t src_const_data_offset)
+__device__ float4_t amd_intrinsic_buffer_load<float, 4>(const float* p_src_block,
+                                                        index_t src_thread_data_offset,
+                                                        index_t src_const_data_offset)
{
    float4_t dst;
@@ -170,10 +171,10 @@ __device__ float4_t __buffer_load<float, 4>(const float* p_src_block,
}
template <>
-__device__ void __buffer_store<float, 1>(const float& src,
-                                         float* p_dst_block,
-                                         index_t dst_thread_data_offset,
-                                         index_t dst_const_data_offset)
+__device__ void amd_intrinsic_buffer_store<float, 1>(const float& src,
+                                                     float* p_dst_block,
+                                                     index_t dst_thread_data_offset,
+                                                     index_t dst_const_data_offset)
{
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
@@ -207,10 +208,10 @@ __device__ void __buffer_store<float, 1>(const float& src,
}
template <>
-__device__ void __buffer_store<float, 2>(const float2_t& src,
-                                         float* p_dst_block,
-                                         index_t dst_thread_data_offset,
-                                         index_t dst_const_data_offset)
+__device__ void amd_intrinsic_buffer_store<float, 2>(const float2_t& src,
+                                                     float* p_dst_block,
+                                                     index_t dst_thread_data_offset,
+                                                     index_t dst_const_data_offset)
{
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
@@ -244,10 +245,10 @@ __device__ void __buffer_store<float, 2>(const float2_t& src,
}
template <>
-__device__ void __buffer_store<float, 4>(const float4_t& src,
-                                         float* p_dst_block,
-                                         index_t dst_thread_data_offset,
-                                         index_t dst_const_data_offset)
+__device__ void amd_intrinsic_buffer_store<float, 4>(const float4_t& src,
+                                                     float* p_dst_block,
+                                                     index_t dst_thread_data_offset,
+                                                     index_t dst_const_data_offset)
{
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
...
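Semantically, the renamed wrappers are vectorized global accesses at p_block + thread_offset + const_offset; the hardware path goes through a 128-bit buffer resource descriptor and the llvm.amdgcn.buffer.* intrinsics declared above. A plain reference of the float, 2-wide case, ignoring the out-of-range handling the buffer descriptor provides (a sketch of the semantics only, not the committed implementation):

// What amd_intrinsic_buffer_load<float, 2> returns for an in-range access.
__device__ float2_t buffer_load_float2_reference(const float* p_src_block,
                                                 index_t src_thread_data_offset,
                                                 index_t src_const_data_offset)
{
    return *reinterpret_cast<const float2_t*>(
        &p_src_block[src_thread_data_offset + src_const_data_offset]);
}

// The matching store, i.e. amd_intrinsic_buffer_store<float, 2> for an in-range access.
__device__ void buffer_store_float2_reference(const float2_t& src,
                                              float* p_dst_block,
                                              index_t dst_thread_data_offset,
                                              index_t dst_const_data_offset)
{
    *reinterpret_cast<float2_t*>(
        &p_dst_block[dst_thread_data_offset + dst_const_data_offset]) = src;
}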
@@ -6,7 +6,7 @@
namespace ck {
// outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
+__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
{
    // disable inline asm due to the compiler issue: SWDEV-202749
    ///\to-do: enable the inline asm after the compiler fix
@@ -24,7 +24,7 @@ __device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, floa
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x4(
+__device__ void amd_assembly_outer_product_1x4(
    float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
{
    asm volatile("\n \
@@ -38,11 +38,12 @@ __device__ void __outer_product_1x4(
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
+__device__ void
+amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1)
{
    asm volatile("\n \
-                 v_dot2_f32_f16 %0, %2, %3 %0\n \
-                 v_dot2_f32_f16 %1, %2, %4 %1\n \
+                 v_dot2_f32_f16 %0, %2, %3, %0\n \
+                 v_dot2_f32_f16 %1, %2, %4, %1\n \
                 "
                 : "=v"(c0), "=v"(c1) // Dest registers
                 : "v"(a), // 1st Src register for 1 half2 registers
@@ -53,7 +54,8 @@ __device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
+__device__ void
+amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1)
{
    const half2_t* p_a_half2 = reinterpret_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = reinterpret_cast<const half2_t*>(&b0);
@@ -61,10 +63,10 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0
    // do dot2 two times
    asm volatile("\n \
-                 v_dot2_f32_f16 %0, %2, %4 %0\n \
-                 v_dot2_f32_f16 %1, %2, %6 %1\n \
-                 v_dot2_f32_f16 %0, %3, %5 %0\n \
-                 v_dot2_f32_f16 %1, %3, %7 %1\n \
+                 v_dot2_f32_f16 %0, %2, %4, %0\n \
+                 v_dot2_f32_f16 %1, %2, %6, %1\n \
+                 v_dot2_f32_f16 %0, %3, %5, %0\n \
+                 v_dot2_f32_f16 %1, %3, %7, %1\n \
                 "
                 : "=v"(c0), "=v"(c1) // Dest registers
                 : "v"(p_a_half2[0]),
@@ -78,21 +80,21 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x4(half2_t a,
+__device__ void amd_assembly_outer_product_1x4(half2_t a,
                                    half2_t b0,
                                    half2_t b1,
                                    half2_t b2,
                                    half2_t b3,
                                    float& c0,
                                    float& c1,
                                    float& c2,
                                    float& c3)
{
    asm volatile("\n \
-                 v_dot2_f32_f16 %0, %4, %5 %0\n \
-                 v_dot2_f32_f16 %1, %4, %6 %1\n \
-                 v_dot2_f32_f16 %2, %4, %7 %2\n \
-                 v_dot2_f32_f16 %3, %4, %8 %3\n \
+                 v_dot2_f32_f16 %0, %4, %5, %0\n \
+                 v_dot2_f32_f16 %1, %4, %6, %1\n \
+                 v_dot2_f32_f16 %2, %4, %7, %2\n \
+                 v_dot2_f32_f16 %3, %4, %8, %3\n \
                 "
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers
                 : "v"(a), // 1st Src register for 1 half2 registers
@@ -107,15 +109,15 @@ __device__ void __outer_product_1x4(half2_t a,
}
// outer-product: c[i,j] += inner_product(a[i], b[j])
-__device__ void __outer_product_1x4(half4_t a,
+__device__ void amd_assembly_outer_product_1x4(half4_t a,
                                    half4_t b0,
                                    half4_t b1,
                                    half4_t b2,
                                    half4_t b3,
                                    float& c0,
                                    float& c1,
                                    float& c2,
                                    float& c3)
{
    const half2_t* p_a_half2 = reinterpret_cast<const half2_t*>(&a);
    const half2_t* p_b0_half2 = reinterpret_cast<const half2_t*>(&b0);
@@ -125,14 +127,14 @@ __device__ void __outer_product_1x4(half4_t a,
    // do dot2 two times
    asm volatile("\n \
-                 v_dot2_f32_f16 %0, %4, %6 %0\n \
-                 v_dot2_f32_f16 %1, %4, %8 %1\n \
-                 v_dot2_f32_f16 %2, %4, %10 %2\n \
-                 v_dot2_f32_f16 %3, %4, %12 %3\n \
-                 v_dot2_f32_f16 %0, %5, %7 %0\n \
-                 v_dot2_f32_f16 %1, %5, %9 %1\n \
-                 v_dot2_f32_f16 %2, %5, %11 %2\n \
-                 v_dot2_f32_f16 %3, %5, %13 %3\n \
+                 v_dot2_f32_f16 %0, %4, %6, %0\n \
+                 v_dot2_f32_f16 %1, %4, %8, %1\n \
+                 v_dot2_f32_f16 %2, %4, %10, %2\n \
+                 v_dot2_f32_f16 %3, %4, %12, %3\n \
+                 v_dot2_f32_f16 %0, %5, %7, %0\n \
+                 v_dot2_f32_f16 %1, %5, %9, %1\n \
+                 v_dot2_f32_f16 %2, %5, %11, %2\n \
+                 v_dot2_f32_f16 %3, %5, %13, %3\n \
                 "
                 : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers
                 : "v"(p_a_half2[0]),
...
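Besides the rename, this file fixes the v_dot2_f32_f16 operand syntax: the fp32 accumulator is an explicit third source operand, so it must be separated by a comma (v_dot2_f32_f16 %0, %2, %3, %0). For reference, each instruction computes a 2-element fp16 dot product accumulated into fp32; a scalar sketch (illustrative, not from the patch):

// v_dot2_f32_f16 d, a, b, c  computes  d = a[0] * b[0] + a[1] * b[1] + c,
// where a and b are packed fp16 pairs and c, d are fp32.
__device__ float dot2_f32_f16_reference(half2_t a, half2_t b, float c)
{
    const _Float16* p_a = reinterpret_cast<const _Float16*>(&a);
    const _Float16* p_b = reinterpret_cast<const _Float16*>(&b);
    return static_cast<float>(p_a[0]) * static_cast<float>(p_b[0]) +
           static_cast<float>(p_a[1]) * static_cast<float>(p_b[1]) + c;
}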
@@ -24,4 +24,8 @@
#include "amd_buffer_addressing.hpp"
#endif
+#if CK_USE_AMD_XDLOPS
+#include "amd_xdlops.hpp"
+#endif
#endif
@@ -31,11 +31,11 @@
// AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS
-#define CK_USE_AMD_XDLOPS 1
+#define CK_USE_AMD_XDLOPS 0
#endif
#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
-#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
+#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
#endif
// experimental implementation
...
#ifndef CK_FLOAT_TYPE_AMD_HPP
#define CK_FLOAT_TYPE_AMD_HPP
namespace ck {
// For some reason, the HIP compiler needs these definitions to generate optimal ISA
// float
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float16_t __attribute__((ext_vector_type(16)));
typedef float float32_t __attribute__((ext_vector_type(32)));
// float16
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
// bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
template <class T, index_t N>
struct vector_type
{
typedef struct
{
T scalar[N];
} MemoryType;
};
template <>
struct vector_type<float, 1>
{
using MemoryType = float;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<float, 2>
{
using MemoryType = float2_t;
union DataType
{
MemoryType vector;
float scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<float, 4>
{
using MemoryType = float4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 1>
{
using MemoryType = half;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 2>
{
using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
using MemoryType = half4_t;
union DataType
{
MemoryType vector;
half scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
template <>
struct vector_type<ushort, 1>
{
using MemoryType = ushort;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
};
template <>
struct vector_type<ushort, 2>
{
using MemoryType = ushort2_t;
union DataType
{
MemoryType vector;
ushort scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<ushort, 4>
{
using MemoryType = ushort4_t;
union DataType
{
MemoryType vector;
ushort scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(X x) const
{
return static_cast<T>(x);
}
};
template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
return bfloat16_to_float(x);
}
template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
return float_to_bfloat16(x);
}
template <typename T>
struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(half4_t a, half4_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(ushort2_t a, ushort2_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
__device__ T operator()(ushort4_t a, ushort4_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
};
} // namespace ck
#endif
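A short, hypothetical usage sketch of the helpers introduced in this header (the function name demo_float_type_helpers and the literal values are illustrative; assumes the header above is included):

__device__ float demo_float_type_helpers()
{
    using namespace ck;

    // Pack two fp32 scalars into a float2_t, then overwrite lane 1 in place.
    float2_t v = vector_type<float, 2>::Pack(1.0f, 2.0f);
    vector_type<float, 2>::SetScalar(v, 3.0f, Number<1>{}); // v is now {1.0f, 3.0f}

    // Lane 1 can be read back the same way the rest of CK does it.
    float lane1 = reinterpret_cast<const float*>(&v)[1]; // 3.0f

    // fp16 dot product accumulated in fp32 through type_convert: 1*3 + 2*4 = 11.
    const half h1 = static_cast<half>(1.0f), h2 = static_cast<half>(2.0f);
    const half h3 = static_cast<half>(3.0f), h4 = static_cast<half>(4.0f);
    half2_t a = vector_type<half, 2>::Pack(h1, h2);
    half2_t b = vector_type<half, 2>::Pack(h3, h4);

    return lane1 + inner_product_with_conversion<float>{}(a, b); // 14.0f
}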
@@ -297,7 +297,7 @@ int main(int argc, char* argv[])
    using LeftPads = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
    constexpr index_t N = 128;
@@ -343,7 +343,7 @@ int main(int argc, char* argv[])
    using LeftPads = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N = 128;
    constexpr index_t C = 128;
@@ -450,7 +450,7 @@ int main(int argc, char* argv[])
                                                         ConvStrides{},
                                                         ConvDilations{},
                                                         nrepeat);
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,
@@ -492,7 +492,7 @@ int main(int argc, char* argv[])
                                                         ConvStrides{},
                                                         ConvDilations{},
                                                         nrepeat);
-#elif 1
+#elif 0
    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,
...