Commit 4cd8f454 authored by Chao Liu

sync with miopen

parent 6fc49f91
...@@ -56,10 +56,12 @@ if(DEVICE_BACKEND STREQUAL "AMD")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/synchronization.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/synchronization.hpp")
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/synchronization.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/synchronization.hpp")
endif()
add_subdirectory(driver)
...@@ -36,8 +36,8 @@ template <index_t GridSize,
index_t ThreadGemmDataPerRead_GemmN, index_t ThreadGemmDataPerRead_GemmN,
typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM, typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM, typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
index_t GemmABlockCopySrcDataPerRead_GemmN, index_t GemmABlockCopySrcDataPerRead_GemmM,
index_t GemmABlockCopyDstDataPerWrite_GemmN, index_t GemmABlockCopyDstDataPerWrite_GemmM,
typename GemmBBlockCopyThreadSliceLengths_GemmK_GemmN, typename GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
typename GemmBBlockCopyThreadClusterLengths_GemmK_GemmN, typename GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
index_t GemmBBlockCopySrcDataPerRead_GemmN, index_t GemmBBlockCopySrcDataPerRead_GemmN,
...@@ -82,13 +82,6 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
constexpr auto wei_gemmk_gemmm_global_desc = constexpr auto wei_gemmk_gemmm_global_desc =
unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3); unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
// output tensor
constexpr auto out_gemmk_gemmn_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// input tensor // input tensor
constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc, in_n_c_hi_wi_global_desc,
...@@ -98,16 +91,15 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
constexpr index_t Hip = in_n_c_hip_wip_global_desc.GetLengths()[2];
constexpr index_t Wip = in_n_c_hip_wip_global_desc.GetLengths()[3];
constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc, in_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{}, make_tuple(PassThrough<N>{},
PassThrough<C>{}, PassThrough<C>{},
Embed<Hi + InLeftPads::At(0) + InRightPads::At(0), Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Sequence<Y, Ho>, Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Wi + InLeftPads::At(1) + InRightPads::At(1),
Sequence<X, Wo>,
Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
...@@ -117,6 +109,13 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
// output tensor
constexpr auto out_gemmk_gemmn_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
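// Worked reading of the three GEMM descriptors above (an inferred sketch, not a
// comment from the original commit): the transforms give
//   GemmK = K            (PassThrough<K> on the output)
//   GemmM = C * Y * X    (unfolded weight / Merge<Sequence<C, Y, X>> on the input)
//   GemmN = N * Ho * Wo  (Merge on the input and on the unfolded output)
// so the backward-data problem is presumably driven as an implicit GEMM with
// A = wei[GemmK, GemmM], B = out[GemmK, GemmN], writing C = in[GemmM, GemmN].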
// GEMM // GEMM
// \todo there are more combinations of Y, ConvDilationH and ConvStrideH that don't need // \todo there are more combinations of Y, ConvDilationH and ConvStrideH that don't need
// atomic, find out all of them // atomic, find out all of them
...@@ -152,8 +151,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
Sequence<0, 1>, Sequence<0, 1>,
Sequence<0, 1>, Sequence<0, 1>,
1, 1,
GemmABlockCopySrcDataPerRead_GemmN, GemmABlockCopySrcDataPerRead_GemmM,
GemmABlockCopyDstDataPerWrite_GemmN, GemmABlockCopyDstDataPerWrite_GemmM,
GemmBBlockCopyThreadSliceLengths_GemmK_GemmN, GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
GemmBBlockCopyThreadClusterLengths_GemmK_GemmN, GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
Sequence<0, 1>, Sequence<0, 1>,
......
...@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
#endif #endif
// weight tensor // weight tensor
constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( constexpr auto wei_gemmk_gemmm_global_desc = reorder_tensor_descriptor_given_upper2lower(
unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});
// input tensor // input tensor
...@@ -110,14 +110,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto in_e_b_global_desc = transform_tensor_descriptor( constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc, in_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}), make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
// output tensor // output tensor
constexpr auto out_k_b_global_desc = constexpr auto out_gemmk_gemmn_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3), transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}), make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}), make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
...@@ -129,9 +129,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
BlockSize, BlockSize,
Float, Float,
AccFloat, AccFloat,
decltype(wei_e_k_global_desc), decltype(wei_gemmk_gemmm_global_desc),
decltype(in_e_b_global_desc), decltype(in_gemmm_gemmn_global_desc),
decltype(out_k_b_global_desc), decltype(out_gemmk_gemmn_global_desc),
InMemoryDataOperation::Set, InMemoryDataOperation::Set,
GemmMPerBlock, GemmMPerBlock,
GemmNPerBlock, GemmNPerBlock,
......
...@@ -31,7 +31,9 @@ template <index_t BlockSize,
AddressSpace SrcAddressSpace = AddressSpace::Generic, AddressSpace SrcAddressSpace = AddressSpace::Generic,
AddressSpace ThreadBufferAddressSpace = AddressSpace::Generic, AddressSpace ThreadBufferAddressSpace = AddressSpace::Generic,
AddressSpace DstAddressSpace = AddressSpace::Generic, AddressSpace DstAddressSpace = AddressSpace::Generic,
InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set> InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
struct BlockwiseGenericTensorSliceCopy_v4 struct BlockwiseGenericTensorSliceCopy_v4
{ {
static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension(); static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
...@@ -178,7 +180,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
1, 1,
SrcAddressSpace, SrcAddressSpace,
ThreadBufferAddressSpace, ThreadBufferAddressSpace,
InMemoryDataOperation::Set>; InMemoryDataOperation::Set,
SrcDataStride,
1>;
using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc, using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
BlockDstDesc, BlockDstDesc,
...@@ -189,7 +193,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
DstDataPerWrite, DstDataPerWrite,
ThreadBufferAddressSpace, ThreadBufferAddressSpace,
DstAddressSpace, DstAddressSpace,
DstInMemOp>; DstInMemOp,
1,
DstDataStride>;
static constexpr auto mThreadClusterDesc = static constexpr auto mThreadClusterDesc =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
......
...@@ -23,7 +23,9 @@ template <typename SrcDesc,
index_t DstDataPerWrite, index_t DstDataPerWrite,
AddressSpace SrcAddressSpace = AddressSpace::Generic, AddressSpace SrcAddressSpace = AddressSpace::Generic,
AddressSpace DstAddressSpace = AddressSpace::Generic, AddressSpace DstAddressSpace = AddressSpace::Generic,
InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set> InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
struct ThreadwiseGenericTensorSliceCopy_v4r2 struct ThreadwiseGenericTensorSliceCopy_v4r2
{ {
static constexpr index_t nDim = SliceLengths::Size(); static constexpr index_t nDim = SliceLengths::Size();
......
...@@ -8,65 +8,149 @@ namespace ck {
// For 128bit SGPRs in buffer_load and buffer_store instructions // For 128bit SGPRs in buffer_load and buffer_store instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T> template <typename T>
union BufferLoadStoreDwordConfig union BufferAddressConfig
{ {
int32x4_t data; int32x4_t data;
T* address[2]; T* address[2];
int32_t range[4]; int32_t range[4];
}; };
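// Illustrative helper (a sketch, not part of this commit; the name make_buffer_resource
// is hypothetical): how BufferAddressConfig is typically filled to build the 128-bit
// buffer resource descriptor consumed by the intrinsics below. The constants mirror the
// specializations later in this file.
template <typename T>
__device__ int32x4_t make_buffer_resource(const T* p_block)
{
    BufferAddressConfig<T> config;

    // dwords 0-1: 64-bit base address of the block-invariant pointer
    config.address[0] = const_cast<T*>(p_block);
    // dword 2: number of records; -1 effectively disables the range check
    config.range[2] = -1;
    // dword 3: data-format / num-format configuration bits used throughout this file
    config.range[3] = 0x00027000;

    return config.data;
}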
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, __device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t rsrc,
index_t vindex, index_t vindex,
index_t offset, index_t offset,
bool glc, bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.f32"); bool slc) __asm("llvm.amdgcn.buffer.load.f32");
__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, __device__ float2_t
__llvm_amdgcn_buffer_load_f32x2(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
__device__ float4_t
__llvm_amdgcn_buffer_load_f32x4(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t rsrc,
index_t vindex, index_t vindex,
index_t offset, index_t offset,
bool glc, bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2f32"); bool slc) __asm("llvm.amdgcn.buffer.load.f16");
__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, __device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");
__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");
__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.bf16");
__device__ ushort2_t
__llvm_amdgcn_buffer_load_bf16x2(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");
__device__ ushort4_t
__llvm_amdgcn_buffer_load_bf16x4(int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");
__device__ void __llvm_amdgcn_buffer_store_f32(float vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.f32");
__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");
__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.f16");
__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");
__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");
__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,
int32x4_t rsrc,
index_t vindex, index_t vindex,
index_t offset, index_t offset,
bool glc, bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f32"); bool slc) __asm("llvm.amdgcn.buffer.store.bf16");
__device__ void __llvm_amdgcn_buffer_store(float vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.f32");
__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");
__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
__device__ void __device__ void
__llvm_amdgcn_buffer_atomic_add(float vdata, __llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,
int32x4_t rsrc, int32x4_t rsrc,
index_t vindex, index_t vindex,
index_t offset, index_t offset,
bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32"); bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2bf16");
__device__ void
__llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4bf16");
__device__ void
__llvm_amdgcn_buffer_atomic_add_f32(float vdata,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");
// buffer_load requires: // buffer_load requires:
// 1) p_src must be in global memory space, d_dst must be vgpr // 1) p_src must be in global memory space, d_dst must be vgpr
// 2) p_src to be a block-invariant pointer. // 2) p_src to be a block-invariant pointer.
// It is user's responsibility to make sure that is true. // It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize> template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType amd_intrinsic_buffer_load( __device__ typename vector_type<T, VectorSize>::MemoryType amd_buffer_load(
const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);
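// Usage sketch (hypothetical helper, not part of this commit): each thread pulls one
// float4 from a block-invariant global pointer into its registers via amd_buffer_load.
// Offsets are element counts; the specializations below convert them to byte offsets.
__device__ float4_t load_float4_example(const float* p_src_block, index_t thread_data_offset)
{
    return amd_buffer_load<float, 4>(p_src_block, thread_data_offset, 0);
}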
// buffer_store requires: // buffer_store requires:
...@@ -74,30 +158,44 @@ __device__ typename vector_type<T, VectorSize>::MemoryType amd_intrinsic_buffer_
// 2) p_dst to be a block-invariant pointer. // 2) p_dst to be a block-invariant pointer.
// It is user's responsibility to make sure that is true. // It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize> template <typename T, index_t VectorSize>
__device__ void __device__ void amd_buffer_store(const T* p_src,
amd_intrinsic_buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src, T* p_dst_block,
T* p_dst_block, index_t dst_thread_data_offset,
index_t dst_thread_data_offset, index_t dst_const_data_offset);
index_t dst_const_data_offset);
template <typename T, index_t VectorSize> template <typename T, index_t VectorSize>
__device__ void __device__ void amd_buffer_atomic_add(const T* p_src,
amd_intrinsic_buffer_atomic_add(const typename vector_type<T, VectorSize>::MemoryType& src, T* p_dst_block,
T* p_dst_block, index_t dst_thread_data_offset,
index_t dst_thread_data_offset, index_t dst_const_data_offset);
index_t dst_const_data_offset);
template <> template <>
__device__ float amd_intrinsic_buffer_load<float, 1>(const float* p_src_block, __device__ float amd_buffer_load<float, 1>(const float* p_src_block,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset) index_t src_const_data_offset)
{ {
float dst; BufferAddressConfig<float> src_block_config;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config; return __llvm_amdgcn_buffer_load_f32(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
}
template <>
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
BufferAddressConfig<float> src_block_config;
// fill in byte 0 - 1 // fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block); src_block_config.address[0] = const_cast<float*>(p_src_block);
...@@ -106,102 +204,283 @@ __device__ float amd_intrinsic_buffer_load<float, 1>(const float* p_src_block,
// fill in byte 3 // fill in byte 3
src_block_config.range[3] = 0x00027000; src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
dst = __llvm_amdgcn_buffer_load( index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
asm volatile(
"\n \
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif
return dst; return __llvm_amdgcn_buffer_load_f32x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
} }
template <> template <>
__device__ float2_t amd_intrinsic_buffer_load<float, 2>(const float* p_src_block, __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_block,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset) index_t src_const_data_offset)
{ {
float2_t dst; BufferAddressConfig<float> src_block_config;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
index_t src_const_addr_offset = src_const_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> src_block_config; return __llvm_amdgcn_buffer_load_f32x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
}
template <>
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
BufferAddressConfig<half_t> src_block_config;
// fill in byte 0 - 1 // fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block); src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2 // fill in byte 2
src_block_config.range[2] = -1; src_block_config.range[2] = -1;
// fill in byte 3 // fill in byte 3
src_block_config.range[3] = 0x00027000; src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC #if !CK_WORKAROUND_SWDEV_231101
dst = __llvm_amdgcn_buffer_loadx2( index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
return __llvm_amdgcn_buffer_load_f16(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else #else
asm volatile( return p_src_block[src_thread_data_offset + src_const_data_offset];
"\n \
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif #endif
}
return dst; template <>
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
BufferAddressConfig<half_t> src_block_config;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_f16x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return *reinterpret_cast<half2_t*>(&dst_out_tmp);
#endif
} }
template <> template <>
__device__ float4_t amd_intrinsic_buffer_load<float, 4>(const float* p_src_block, __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_block,
index_t src_thread_data_offset, index_t src_thread_data_offset,
index_t src_const_data_offset) index_t src_const_data_offset)
{ {
float4_t dst; BufferAddressConfig<half_t> src_block_config;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); // fill in byte 0 - 1
index_t src_const_addr_offset = src_const_data_offset * sizeof(float); src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
BufferLoadStoreDwordConfig<float> src_block_config; #if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_f16x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return *reinterpret_cast<half4_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
BufferAddressConfig<half_t> src_block_config;
// fill in byte 0 - 1 // fill in byte 0 - 1
src_block_config.address[0] = const_cast<float*>(p_src_block); src_block_config.address[0] = const_cast<half_t*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t);
#if !CK_WORKAROUND_SWDEV_231101
static_assert(false, "wrong! not supported");
#else
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return *reinterpret_cast<half8_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
BufferAddressConfig<ushort> src_block_config;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2 // fill in byte 2
src_block_config.range[2] = -1; src_block_config.range[2] = -1;
// fill in byte 3 // fill in byte 3
src_block_config.range[3] = 0x00027000; src_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC #if !CK_WORKAROUND_SWDEV_231101
dst = __llvm_amdgcn_buffer_loadx4( index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
return __llvm_amdgcn_buffer_load_bf16(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else #else
asm volatile( return p_src_block[src_thread_data_offset + src_const_data_offset];
"\n \
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset));
#endif #endif
}
template <>
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
BufferAddressConfig<ushort> src_block_config;
return dst; // fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_bf16x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
#endif
} }
template <> template <>
__device__ void amd_intrinsic_buffer_store<float, 1>(const float& src, __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_block,
float* p_dst_block, index_t src_thread_data_offset,
index_t dst_thread_data_offset, index_t src_const_data_offset)
index_t dst_const_data_offset)
{ {
BufferAddressConfig<ushort> src_block_config;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
return __llvm_amdgcn_buffer_load_bf16x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_block,
index_t src_thread_data_offset,
index_t src_const_data_offset)
{
BufferAddressConfig<ushort> src_block_config;
// fill in byte 0 - 1
src_block_config.address[0] = const_cast<ushort*>(p_src_block);
// fill in byte 2
src_block_config.range[2] = -1;
// fill in byte 3
src_block_config.range[3] = 0x00027000;
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
static_assert(false, "wrong! not implemented");
#else
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
#endif
}
template <>
__device__ void amd_buffer_store<float, 1>(const float* p_src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
BufferAddressConfig<float> dst_block_config;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config; __llvm_amdgcn_buffer_store_f32(*p_src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
}
template <>
__device__ void amd_buffer_store<float, 2>(const float* p_src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
BufferAddressConfig<float> dst_block_config;
// fill in byte 0 - 1 // fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block; dst_block_config.address[0] = p_dst_block;
...@@ -210,35 +489,50 @@ __device__ void amd_intrinsic_buffer_store<float, 1>(const float& src,
// fill in byte 3 // fill in byte 3
dst_block_config.range[3] = 0x00027000; dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
__llvm_amdgcn_buffer_store(src, index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
dst_block_config.data,
0, __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src),
dst_thread_addr_offset + dst_const_addr_offset, dst_block_config.data,
false, 0,
false); dst_thread_addr_offset + dst_const_addr_offset,
#else false,
asm volatile("\n \ false);
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif
} }
template <> template <>
__device__ void amd_intrinsic_buffer_store<float, 2>(const float2_t& src, __device__ void amd_buffer_store<float, 4>(const float* p_src,
float* p_dst_block, float* p_dst_block,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset) index_t dst_const_data_offset)
{ {
BufferAddressConfig<float> dst_block_config;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
BufferLoadStoreDwordConfig<float> dst_block_config; __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src),
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
}
template <>
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src,
half_t* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
BufferAddressConfig<half_t> dst_block_config;
// fill in byte 0 - 1 // fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block; dst_block_config.address[0] = p_dst_block;
...@@ -247,35 +541,68 @@ __device__ void amd_intrinsic_buffer_store<float, 2>(const float2_t& src,
// fill in byte 3 // fill in byte 3
dst_block_config.range[3] = 0x00027000; dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC #if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_storex2(src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
dst_block_config.data, index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
0,
dst_thread_addr_offset + dst_const_addr_offset, __llvm_amdgcn_buffer_store_f16(*p_src,
false, dst_block_config.data,
false); 0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else #else
asm volatile("\n \ p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src;
buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
"
:
: "s"(dst_block_config.data),
"v"(src),
"v"(dst_thread_addr_offset),
"s"(dst_const_addr_offset));
#endif #endif
} }
template <> template <>
__device__ void amd_intrinsic_buffer_store<float, 4>(const float4_t& src, __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src,
float* p_dst_block, half_t* p_dst_block,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset) index_t dst_const_data_offset)
{ {
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); BufferAddressConfig<half_t> dst_block_config;
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
BufferLoadStoreDwordConfig<float> dst_block_config; #if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_store_f16x2(*reinterpret_cast<const half2_t*>(p_src),
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
const float* p_src_tmp = reinterpret_cast<const float*>(p_src);
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src,
half_t* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t);
BufferAddressConfig<half_t> dst_block_config;
// fill in byte 0 - 1 // fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block; dst_block_config.address[0] = p_dst_block;
...@@ -284,35 +611,99 @@ __device__ void amd_intrinsic_buffer_store<float, 4>(const float4_t& src,
// fill in byte 3 // fill in byte 3
dst_block_config.range[3] = 0x00027000; dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC #if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_storex4(src, __llvm_amdgcn_buffer_store_f16x4(*reinterpret_cast<const half4_t*>(p_src),
dst_block_config.data, dst_block_config.data,
0, 0,
dst_thread_addr_offset + dst_const_addr_offset, dst_thread_addr_offset + dst_const_addr_offset,
false, false,
false); false);
#else #else
asm volatile("\n \ const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src);
buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
" __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
: dst_block_config.data,
: "s"(dst_block_config.data), 0,
"v"(src), dst_thread_addr_offset + dst_const_addr_offset,
"v"(dst_thread_addr_offset), false,
"s"(dst_const_addr_offset)); false);
#endif #endif
} }
template <> template <>
__device__ void amd_intrinsic_buffer_atomic_add<float, 1>(const float& src, __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src,
float* p_dst_block, ushort* p_dst_block,
index_t dst_thread_data_offset, index_t dst_thread_data_offset,
index_t dst_const_data_offset) index_t dst_const_data_offset)
{ {
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); BufferAddressConfig<ushort> dst_block_config;
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
__llvm_amdgcn_buffer_store_bf16(*p_src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src;
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src,
ushort* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
BufferAddressConfig<ushort> dst_block_config;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
BufferLoadStoreDwordConfig<float> dst_block_config; index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
#if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_store_bf16x2(*p_src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else
const float* p_src_tmp = reinterpret_cast<const float*>(p_src);
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src,
ushort* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
BufferAddressConfig<ushort> dst_block_config;
// fill in byte 0 - 1 // fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block; dst_block_config.address[0] = p_dst_block;
...@@ -321,13 +712,75 @@ __device__ void amd_intrinsic_buffer_atomic_add<float, 1>(const float& src,
// fill in byte 3 // fill in byte 3
dst_block_config.range[3] = 0x00027000; dst_block_config.range[3] = 0x00027000;
#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
__llvm_amdgcn_buffer_atomic_add( index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort);
src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false);
#if !CK_WORKAROUND_SWDEV_231101
__llvm_amdgcn_buffer_store_bf16x4(*p_src,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#else #else
static_assert(false, " wrong! not implemented"); const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src);
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_block_config.data,
0,
dst_thread_addr_offset + dst_const_addr_offset,
false,
false);
#endif #endif
} }
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
BufferAddressConfig<float> dst_block_config;
// fill in byte 0 - 1
dst_block_config.address[0] = p_dst_block;
// fill in byte 2
dst_block_config.range[2] = -1;
// fill in byte 3
dst_block_config.range[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
__llvm_amdgcn_buffer_atomic_add_f32(
*p_src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false);
}
template <>
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
for(index_t i = 0; i < 2; ++i)
{
amd_buffer_atomic_add<float, 1>(
&p_src[i], p_dst_block, dst_thread_data_offset, dst_const_data_offset + i);
}
}
template <>
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src,
float* p_dst_block,
index_t dst_thread_data_offset,
index_t dst_const_data_offset)
{
for(index_t i = 0; i < 4; ++i)
{
amd_buffer_atomic_add<float, 1>(
&p_src[i], p_dst_block, dst_thread_data_offset, dst_const_data_offset + i);
}
}
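// Usage sketch (illustrative; p_partial and p_out_global are placeholder names, not
// identifiers from this commit): a thread accumulating a float2 of partial results
// into global memory, as the AtomicAdd in-memory operation does:
//   amd_buffer_atomic_add<float, 2>(p_partial, p_out_global, dst_thread_data_offset, 0);
// Note that the 2- and 4-wide specializations above simply loop over the scalar
// llvm.amdgcn.buffer.atomic.fadd.f32 intrinsic.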
} // namespace ck } // namespace ck
#endif #endif
...@@ -16,15 +16,12 @@
#include "functional3.hpp" #include "functional3.hpp"
#include "functional4.hpp" #include "functional4.hpp"
#include "in_memory_operation.hpp" #include "in_memory_operation.hpp"
#include "synchronization.hpp"
#if CK_USE_AMD_INLINE_ASM #if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp" #include "amd_inline_asm.hpp"
#endif #endif
#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp"
#endif
#if CK_USE_AMD_XDLOPS #if CK_USE_AMD_XDLOPS
#include "amd_xdlops.hpp" #include "amd_xdlops.hpp"
#endif #endif
......
...@@ -25,11 +25,7 @@
#define CK_USE_AMD_BUFFER_ADDRESSING 1 #define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif #endif
#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC // only gfx908 support native floating point atomic add
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
#endif
// only support gfx908
#ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD #ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD
#define CK_USE_AMD_BUFFER_ATOMIC_ADD 0 #define CK_USE_AMD_BUFFER_ATOMIC_ADD 0
#endif #endif
...@@ -47,6 +43,11 @@
#define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes #define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
#endif #endif
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
#endif
// experimental implementation // experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0 #define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
...@@ -54,8 +55,24 @@
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif
// workaround: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
// workaround for buffer load/store fp16/bfp16 intrinsic bug
#ifndef CK_WORKAROUND_SWDEV_231101
#define CK_WORKAROUND_SWDEV_231101 1
#endif
namespace ck { namespace ck {
......
...@@ -14,10 +14,12 @@ typedef float float32_t __attribute__((ext_vector_type(32)));
typedef _Float16 half_t; typedef _Float16 half_t;
typedef _Float16 half2_t __attribute__((ext_vector_type(2))); typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4))); typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef _Float16 half8_t __attribute__((ext_vector_type(8)));
// bfloat16 // bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2))); typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4))); typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
typedef ushort ushort8_t __attribute__((ext_vector_type(8)));
template <class T, index_t N> template <class T, index_t N>
struct vector_type struct vector_type
...@@ -152,6 +154,25 @@ struct vector_type<half_t, 4>
} }
}; };
template <>
struct vector_type<half_t, 8>
{
using MemoryType = half8_t;
union DataType
{
MemoryType vector;
half_t scalar[8];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
{
static_assert(I < 8, "wrong");
*(reinterpret_cast<half_t*>(&v) + I) = s;
}
};
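// Usage sketch (illustrative only, not from the original source): filling one lane of
// a half8_t through the new vector_type<half_t, 8> specialization added above:
//   vector_type<half_t, 8>::MemoryType v8;
//   vector_type<half_t, 8>::SetScalar(v8, half_t(1), Number<3>{});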
template <> template <>
struct vector_type<ushort, 1> struct vector_type<ushort, 1>
{ {
...@@ -221,6 +242,25 @@ struct vector_type<ushort, 4>
} }
}; };
template <>
struct vector_type<ushort, 8>
{
using MemoryType = ushort8_t;
union DataType
{
MemoryType vector;
ushort scalar[8];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 8, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
};
// data type conversion // data type conversion
template <typename T> template <typename T>
struct type_convert struct type_convert
...@@ -251,6 +291,34 @@ struct inner_product_with_conversion
{ {
static constexpr auto convert = type_convert<T>(); static constexpr auto convert = type_convert<T>();
__device__ T operator()(float4_t a, float4_t b) const
{
const float* p_a_float = reinterpret_cast<const float*>(&a);
const float* p_b_float = reinterpret_cast<const float*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_float[v]) * convert(p_b_float[v]);
}
return acc;
}
__device__ T operator()(float2_t a, float2_t b) const
{
const float* p_a_float = reinterpret_cast<const float*>(&a);
const float* p_b_float = reinterpret_cast<const float*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_float[v]) * convert(p_b_float[v]);
}
return acc;
}
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); } __device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const __device__ T operator()(half2_t a, half2_t b) const
...@@ -280,6 +348,19 @@ struct inner_product_with_conversion
return acc; return acc;
} }
__device__ T operator()(half8_t a, half8_t b) const
{
const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
T acc = 0;
for(index_t v = 0; v < 8; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(ushort2_t a, ushort2_t b) const __device__ T operator()(ushort2_t a, ushort2_t b) const
{ {
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a); const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
...@@ -306,6 +387,19 @@ struct inner_product_with_conversion
} }
return acc; return acc;
} }
__device__ T operator()(ushort8_t a, ushort8_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 8; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
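// Usage sketch (illustrative; a_vec and b_vec are placeholder names): accumulating an
// 8-wide dot product with per-element conversion, matching the new half8_t / ushort8_t
// overloads above:
//   float acc = inner_product_with_conversion<float>{}(a_vec, b_vec);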
}; };
} // namespace ck } // namespace ck
......
...@@ -2,91 +2,159 @@
#define CK_IN_MEMORY_OPERATION_AMD_HPP #define CK_IN_MEMORY_OPERATION_AMD_HPP
#include "float_type.hpp" #include "float_type.hpp"
#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp" #include "amd_buffer_addressing.hpp"
#endif
namespace ck { namespace ck {
template <typename T, template <typename T>
index_t DataPerAccess, __device__ void atomic_add_impl(T* p_dst, T src)
AddressSpace SrcAddressSpace, {
AddressSpace DstAddressSpace> atomicAdd(p_dst, src);
__device__ void set_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) }
// atomicAdd for float does not support vector type
template <>
__device__ void atomic_add_impl<float2_t>(float2_t* p_dst, float2_t src)
{
float* p_dst_float = reinterpret_cast<float*>(p_dst);
const float* p_src_float = reinterpret_cast<const float*>(&src);
for(index_t i = 0; i < 2; ++i)
{
atomicAdd(&(p_dst_float[i]), p_src_float[i]);
}
}
template <>
__device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
{
float* p_dst_float = reinterpret_cast<float*>(p_dst);
const float* p_src_float = reinterpret_cast<const float*>(&src);
for(index_t i = 0; i < 4; ++i)
{
atomicAdd(&(p_dst_float[i]), p_src_float[i]);
}
}
template <typename T, index_t DataPerAccess>
struct SetData
{ {
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType; using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
// This generic version exists only for compatibility; prefer the address-space specializations below when possible
template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
__device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
}
#if CK_USE_AMD_BUFFER_ADDRESSING #if CK_USE_AMD_BUFFER_ADDRESSING
// TODO: use static_if::ElseIf, instead of nested static_if // buffer_load requires:
static_if<SrcAddressSpace == AddressSpace::Global && // 1) p_src must be in global memory space, d_dst must be vgpr
DstAddressSpace == AddressSpace::Vgpr>{}([&](auto) { // 2) p_src to be a block-invariant pointer.
// buffer_load requires: // It is user's responsibility to make sure that is true.
// 1) p_src must be in global memory space, d_dst must be vgpr template <>
// 2) p_src to be a block-invariant pointer. __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
// It is user's responsibility to make sure that is true. index_t src_offset,
T* p_dst,
index_t dst_offset) const
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0); amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
}).Else([&](auto) { }
static_if<SrcAddressSpace == AddressSpace::Vgpr &&
DstAddressSpace == AddressSpace::Global>{}([&](auto) { // buffer_store requires:
// buffer_store requires: // 1) p_src must be in vgpr space, d_dst must be global memory
// 1) p_src must be in vgpr space, d_dst must be global memory // 2) p_dst to be a block-invariant pointer.
// 2) p_dst to be a block-invariant pointer. // It is user's responsibility to make sure that is true.
// It is user's responsibility to make sure that is true. template <>
amd_intrinsic_buffer_store<T, DataPerAccess>( __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0); index_t src_offset,
}).Else([&](auto) { T* p_dst,
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = index_t dst_offset) const
*reinterpret_cast<const vector_t*>(&p_src[src_offset]); {
}); amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
}); }
#else
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
#endif #endif
} };
template <typename T, template <typename T, index_t DataPerAccess>
index_t DataPerAccess, struct AtomicAddData
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace>
__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{ {
using vector_t = typename vector_type<T, DataPerAccess>::MemoryType; using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
static_if<SrcAddressSpace == AddressSpace::Vgpr && // This version is only for compatibility, don't use this version if possible
DstAddressSpace == AddressSpace::Global>{}([&](auto) { template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
#if CK_USE_AMD_BUFFER_ATOMIC_ADD __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
amd_intrinsic_buffer_atomic_add<T, DataPerAccess>( {
*reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0); atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
#else *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]), }
*reinterpret_cast<const vector_t*>(&p_src[src_offset]));
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
// buffer_atomic_add requires:
// 1) p_src must be in vgpr space, d_dst must be global memory
// 2) p_dst to be a block-invariant pointer.
// It is user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
index_t src_offset,
T* p_dst,
index_t dst_offset) const
{
amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
}
#endif #endif
}).Else([&](auto fwd) { };
static_assert(fwd(false), "atomic_add doesn't support this memory space");
});
}
template <typename T, template <typename T,
index_t DataPerAccess, index_t DataPerAccess,
AddressSpace SrcAddressSpace, AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace, AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp> InMemoryDataOperation DstInMemOp,
index_t SrcDataStride = 1,
index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{ {
static_assert(DstInMemOp == InMemoryDataOperation::Set || static_assert(DstInMemOp == InMemoryDataOperation::Set ||
DstInMemOp == InMemoryDataOperation::AtomicAdd, DstInMemOp == InMemoryDataOperation::AtomicAdd,
"wrong! InMemoryDataOperation not supported!"); "wrong! InMemoryDataOperation not supported!");
// TODO: use static_if::ElseIf // keep it simple, don't use static_if here, otherwise compiler will do weird things
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) { if(SrcDataStride == 1 && DstDataStride == 1)
set_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>( {
p_src, src_offset, p_dst, dst_offset); // TODO: use static_if::ElseIf
}); static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset);
});
}
else
{
for(index_t i = 0; i < DataPerAccess; i++)
{
// TODO: use static_if::ElseIf
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
});
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>( AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, p_dst, dst_offset); p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
}); });
}
}
} }
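// Illustrative sketch (hypothetical wrapper, not part of this commit): a strided
// VGPR -> global store of 4 floats whose destination elements are 2 apart. With a
// non-unit stride, transfer_data takes the element-wise path above instead of a single
// vectorized access; p_dst_global must still satisfy the buffer_store requirements.
__device__ void strided_store_example(const float* p_thread_buf, float* p_dst_global, index_t dst_offset)
{
    transfer_data<float,
                  4,
                  AddressSpace::Vgpr,
                  AddressSpace::Global,
                  InMemoryDataOperation::Set,
                  1,  // SrcDataStride
                  2>  // DstDataStride
        (p_thread_buf, 0, p_dst_global, dst_offset);
}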
} // namespace ck } // namespace ck
......
#ifndef CK_SYNCHRONIZATION_AMD_HPP
#define CK_SYNCHRONIZATION_AMD_HPP
#include "config.hpp"
namespace ck {
__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
__device__ void block_sync_lds()
{
#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
asm volatile("\
s_waitcnt lgkmcnt(0) \n \
s_barrier \
" ::);
#else
__llvm_amdgcn_s_barrier();
#endif
}
__device__ void block_sync_lds_vmem() { __llvm_amdgcn_s_barrier(); }
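// Usage sketch (illustrative kernel fragment, not part of this commit): the LDS
// produce/consume pattern that block_sync_lds() guards. With
// CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM enabled it only waits on LDS traffic (lgkmcnt),
// not on outstanding vector-memory loads (vmcnt).
__device__ void lds_exchange_example(float* p_shared, float* p_out, index_t tid)
{
    p_shared[tid] = static_cast<float>(tid); // every thread writes its own LDS slot

    block_sync_lds(); // make the LDS writes visible to the whole workgroup

    if(tid == 0)
    {
        p_out[0] = p_shared[0] + p_shared[1]; // safely read other threads' slots
    }
}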
} // namespace ck
#endif
...@@ -160,10 +160,10 @@ int main(int argc, char* argv[])
#elif 0 #elif 0
// 1x7 filter, 0x3 pad, 17x17 input // 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 128; constexpr index_t C = 1024;
constexpr index_t HI = 17; constexpr index_t HI = 17;
constexpr index_t WI = 17; constexpr index_t WI = 17;
constexpr index_t K = 128; constexpr index_t K = 1024;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 7; constexpr index_t X = 7;
...@@ -247,7 +247,7 @@ int main(int argc, char* argv[])
#if 0 #if 0
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0 #elif 1
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 0 #elif 0
device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
......
...@@ -19,7 +19,7 @@ int main(int argc, char* argv[])
{ {
using namespace ck; using namespace ck;
#if 1 #if 0
// 1x1, 17x17 // 1x1, 17x17
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 1024; constexpr index_t C = 1024;
...@@ -97,10 +97,10 @@ int main(int argc, char* argv[])
#elif 0 #elif 0
// 7x1, 17x17 // 7x1, 17x17
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 256; constexpr index_t C = 128;
constexpr index_t HI = 17; constexpr index_t HI = 17;
constexpr index_t WI = 17; constexpr index_t WI = 17;
constexpr index_t K = 320; constexpr index_t K = 128;
constexpr index_t Y = 7; constexpr index_t Y = 7;
constexpr index_t X = 1; constexpr index_t X = 1;
...@@ -109,13 +109,13 @@
using LeftPads = Sequence<3, 0>; using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>;
#elif 0 #elif 1
// 1x7, 17x17 // 1x7, 17x17
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 224; constexpr index_t C = 128;
constexpr index_t HI = 17; constexpr index_t HI = 17;
constexpr index_t WI = 17; constexpr index_t WI = 17;
constexpr index_t K = 224; constexpr index_t K = 128;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 7; constexpr index_t X = 7;
...@@ -124,7 +124,7 @@
using LeftPads = Sequence<0, 3>; using LeftPads = Sequence<0, 3>;
using RightPads = Sequence<0, 3>; using RightPads = Sequence<0, 3>;
#elif 1 #elif 0
// 3x3, 299x299 stride=2 // 3x3, 299x299 stride=2
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 3; constexpr index_t C = 3;
...@@ -565,7 +565,7 @@ int main(int argc, char* argv[])
#endif #endif
} }
#if 0 #if 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
......