Commit 2a87a973 authored by Chao Liu's avatar Chao Liu
Browse files

use raw buffer intrinsic

parent 079b745d
...@@ -3,10 +3,10 @@ ...@@ -3,10 +3,10 @@
template <typename GridwiseOp, typename... Xs> template <typename GridwiseOp, typename... Xs>
__global__ void __global__ void
#if 1 #if 0
__launch_bounds__(256, 2) __launch_bounds__(256, 2)
#endif #endif
run_gridwise_operation(Xs... xs) run_gridwise_operation(Xs... xs)
{ {
GridwiseOp{}.Run(xs...); GridwiseOp{}.Run(xs...);
} }
......
...@@ -2,20 +2,10 @@ ...@@ -2,20 +2,10 @@
#define CK_AMD_BUFFER_ADDRESSING_HPP #define CK_AMD_BUFFER_ADDRESSING_HPP
#include "float_type.hpp" #include "float_type.hpp"
#include "amd_buffer_addressing_v2.hpp"
namespace ck { namespace ck {
// For 128 bit SGPRs to supply resource constant in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferResourceConstant
{
int32x4_t data;
T* address[2];
int32_t range[4];
int32_t config[4];
};
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc, __device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
index_t vindex, index_t vindex,
index_t offset, index_t offset,
......
...@@ -2,11 +2,9 @@ ...@@ -2,11 +2,9 @@
#define CK_AMD_BUFFER_ADDRESSING_V2_HPP #define CK_AMD_BUFFER_ADDRESSING_V2_HPP
#include "float_type.hpp" #include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace ck { namespace ck {
#if 0
// For 128 bit SGPRs to supply resource constant in buffer instructions // For 128 bit SGPRs to supply resource constant in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T> template <typename T>
...@@ -17,27 +15,45 @@ union BufferResourceConstant ...@@ -17,27 +15,45 @@ union BufferResourceConstant
int32_t range[4]; int32_t range[4];
int32_t config[4]; int32_t config[4];
}; };
#endif
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc, __device__ float
index_t vindex, __llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
index_t offset, index_t voffset,
bool glc, index_t soffset,
bool slc) __asm("llvm.amdgcn.buffer.load.f32"); index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
__device__ float2_t __device__ float2_t
__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc, __llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc,
index_t vindex, index_t voffset,
index_t offset, index_t soffset,
bool glc, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
__device__ float4_t __device__ float4_t
__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc, __llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
index_t vindex, index_t voffset,
index_t offset, index_t soffset,
bool glc, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
__device__ void
__llvm_amdgcn_raw_buffer_store_fp32(float vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
__device__ void
__llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
__device__ void
__llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
// buffer_load requires: // buffer_load requires:
// 1) p_src_wave must be in global memory space // 1) p_src_wave must be in global memory space
...@@ -82,11 +98,11 @@ __device__ float amd_buffer_load_v2<float, 1>(const float* p_src_wave, ...@@ -82,11 +98,11 @@ __device__ float amd_buffer_load_v2<float, 1>(const float* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32( return __llvm_amdgcn_raw_buffer_load_fp32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
#else #else
float tmp = __llvm_amdgcn_buffer_load_f32( float tmp = __llvm_amdgcn_raw_buffer_load_fp32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false); src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0);
return src_thread_data_valid ? tmp : float(0); return src_thread_data_valid ? tmp : float(0);
#endif #endif
...@@ -112,11 +128,11 @@ __device__ float2_t amd_buffer_load_v2<float, 2>(const float* p_src_wave, ...@@ -112,11 +128,11 @@ __device__ float2_t amd_buffer_load_v2<float, 2>(const float* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2( return __llvm_amdgcn_raw_buffer_load_fp32x2(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
#else #else
float2_t tmp = __llvm_amdgcn_buffer_load_f32x2( float2_t tmp = __llvm_amdgcn_raw_buffer_load_fp32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false); src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0);
return src_thread_data_valid ? tmp : float2_t(0); return src_thread_data_valid ? tmp : float2_t(0);
#endif #endif
...@@ -142,11 +158,11 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave, ...@@ -142,11 +158,11 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4( return __llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
#else #else
float4_t tmp = __llvm_amdgcn_buffer_load_f32x4( float4_t tmp = __llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false); src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0);
return src_thread_data_valid ? tmp : float4_t(0); return src_thread_data_valid ? tmp : float4_t(0);
#endif #endif
...@@ -173,21 +189,16 @@ __device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data, ...@@ -173,21 +189,16 @@ __device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(src_thread_data, __llvm_amdgcn_raw_buffer_store_fp32(src_thread_data,
dst_wave_buffer_resource.data, dst_wave_buffer_resource.data,
0, dst_addr_shift + dst_thread_addr_offset,
dst_addr_shift + dst_thread_addr_offset, 0,
false, 0);
false);
#else #else
if(dst_thread_data_valid) if(dst_thread_data_valid)
{ {
__llvm_amdgcn_buffer_store_f32(src_thread_data, __llvm_amdgcn_buffer_store_fp32(
dst_wave_buffer_resource.data, src_thread_data, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
0,
dst_thread_addr_offset,
false,
false);
} }
#endif #endif
} }
...@@ -213,21 +224,16 @@ __device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data, ...@@ -213,21 +224,16 @@ __device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(src_thread_data, __llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data,
dst_wave_buffer_resource.data, dst_wave_buffer_resource.data,
0, dst_addr_shift + dst_thread_addr_offset,
dst_addr_shift + dst_thread_addr_offset, 0,
false, 0);
false);
#else #else
if(dst_thread_data_valid) if(dst_thread_data_valid)
{ {
__llvm_amdgcn_buffer_store_f32x2(src_thread_data, __llvm_amdgcn_raw_buffer_store_fp32x2(
dst_wave_buffer_resource.data, src_thread_data, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
0,
dst_thread_addr_offset,
false,
false);
} }
#endif #endif
} }
...@@ -253,21 +259,16 @@ __device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data, ...@@ -253,21 +259,16 @@ __device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(src_thread_data, __llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data,
dst_wave_buffer_resource.data, dst_wave_buffer_resource.data,
0, dst_addr_shift + dst_thread_addr_offset,
dst_addr_shift + dst_thread_addr_offset, 0,
false, 0);
false);
#else #else
if(dst_thread_data_valid) if(dst_thread_data_valid)
{ {
__llvm_amdgcn_buffer_store_f32x4(src_thread_data, __llvm_amdgcn_raw_buffer_store_fp32x4(
dst_wave_buffer_resource.data, src_thread_data, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
0,
dst_thread_addr_offset,
false,
false);
} }
#endif #endif
} }
......
...@@ -11,9 +11,6 @@ ...@@ -11,9 +11,6 @@
#define CK_HIP_VERSION_FLAT 0 #define CK_HIP_VERSION_FLAT 0
#endif #endif
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0
// multi index // multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 #define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
...@@ -117,11 +114,7 @@ enum InMemoryDataOperation ...@@ -117,11 +114,7 @@ enum InMemoryDataOperation
AtomicAdd AtomicAdd
}; };
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
using index_t = int32_t; using index_t = int32_t;
#endif
typedef int32_t int32x2_t __attribute__((ext_vector_type(2))); typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
......
...@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc ...@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1; constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1; constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0 #elif 1
// cdata = 64, BlockSize = 256, 128x128x8 // cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2 // b thread copy 2x2
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
...@@ -235,7 +235,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc ...@@ -235,7 +235,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr auto conv_driver = constexpr auto conv_driver =
#if 1 #if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#elif 0 #elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
#elif 1 #elif 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1 DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_1x1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment