"...composable_kernel.git" did not exist on "1f543bfa79de0687f9b6144b5dea10f4190c8892"
Commit c03945f0 authored by Chao Liu's avatar Chao Liu
Browse files

revert changes to buffer addressing that is irrelavent to magic number division

parent 0dc0fa5c
...@@ -6,17 +6,6 @@ ...@@ -6,17 +6,6 @@
namespace ck { namespace ck {
template <typename T>
union BufferResource
{
// 128 bit SGPRs to supply buffer resource in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
int32x4_t data;
T* address[2];
int32_t range[4];
int32_t config[4];
};
__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc, __device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
index_t vindex, index_t vindex,
index_t offset, index_t offset,
......
...@@ -6,27 +6,27 @@ ...@@ -6,27 +6,27 @@
namespace ck { namespace ck {
template <typename T> template <typename T>
union BufferResource_v2 union BufferResource
{ {
// 128 bit SGPRs to supply buffer resource in buffer instructions // 128 bit SGPRs to supply buffer resource in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
int32x4_t data; int32x4_t data;
StaticallyIndexedArray<T*, 2> address; T* address[2];
StaticallyIndexedArray<int32_t, 4> range; int32_t range[4];
StaticallyIndexedArray<int32_t, 4> config; int32_t config[4];
}; };
template <typename T> template <typename T>
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size) __device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size)
{ {
BufferResource_v2<T> wave_buffer_resource; BufferResource<T> wave_buffer_resource;
// wavewise base address (64 bit) // wavewise base address (64 bit)
wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave); wave_buffer_resource.address[0] = const_cast<remove_cv_t<T>*>(p_wave);
// wavewise range (32 bit) // wavewise range (32 bit)
wave_buffer_resource.range(Number<2>{}) = data_space_size * sizeof(T); wave_buffer_resource.range[2] = data_space_size * sizeof(T);
// wavewise setting (32 bit) // wavewise setting (32 bit)
wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD; wave_buffer_resource.config[3] = CK_BUFFER_RESOURCE_3RD_DWORD;
return wave_buffer_resource.data; return wave_buffer_resource.data;
} }
...@@ -37,19 +37,6 @@ __llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, ...@@ -37,19 +37,6 @@ __llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
index_t voffset, index_t voffset,
index_t soffset, index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8"); index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
__device__ int8x2_t
__llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
__device__ int8x4_t
__llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
__device__ int16_t __device__ int16_t
__llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, __llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
index_t voffset, index_t voffset,
...@@ -118,20 +105,6 @@ __llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, ...@@ -118,20 +105,6 @@ __llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
index_t soffset, index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8"); index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");
__device__ void
__llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
__device__ void
__llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
__device__ void __device__ void
__llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, __llvm_amdgcn_raw_buffer_store_i16(int16_t vdata,
int32x4_t rsrc, int32x4_t rsrc,
...@@ -210,7 +183,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, ...@@ -210,7 +183,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
index_t src_wave_addr_offset) index_t src_wave_addr_offset)
{ {
static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) || static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
(is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4)) || (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4)) ||
(is_same<T, half2_t>::value && (N == 1)) || (is_same<T, half2_t>::value && (N == 1)) ||
(is_same<T, half4_t>::value && (N == 1)) || (is_same<T, half4_t>::value && (N == 1)) ||
...@@ -334,38 +306,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, ...@@ -334,38 +306,6 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
src_thread_addr_offset, src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t), src_wave_addr_offset + 4 * sizeof(int32_t),
0); 0);
return tmp.Vector();
}
}
else if constexpr(is_same<T, int8_t>::value)
{
if constexpr(N == 1)
{
return __llvm_amdgcn_raw_buffer_load_i8(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
}
else if constexpr(N == 2)
{
return __llvm_amdgcn_raw_buffer_load_i8x2(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
}
else if constexpr(N == 4)
{
return __llvm_amdgcn_raw_buffer_load_i8x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
}
else if constexpr(N == 8)
{
vector_type<int8_t, 8> tmp;
tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_i8x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) =
__llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int8_t),
0);
return tmp.Vector(); return tmp.Vector();
} }
......
...@@ -14,11 +14,11 @@ ...@@ -14,11 +14,11 @@
#define CK_DEVICE_BACKEND_AMD 1 #define CK_DEVICE_BACKEND_AMD 1
// GPU ID // GPU ID
#if 1 #if 0
#define CK_AMD_GPU_GFX906 1 #define CK_AMD_GPU_GFX906 1
#elif 0 #elif 0
#define CK_AMD_GPU_GFX908 1 #define CK_AMD_GPU_GFX908 1
#elif 0 #elif 1
#define CK_AMD_GPU_GFX1030 1 #define CK_AMD_GPU_GFX1030 1
#endif #endif
...@@ -88,7 +88,7 @@ ...@@ -88,7 +88,7 @@
// experimental implementation // experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK #ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#endif #endif
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK #ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
......
...@@ -437,7 +437,6 @@ struct vector_type<int8_t, 16> ...@@ -437,7 +437,6 @@ struct vector_type<int8_t, 16>
// i8 // i8
// hack for int8x4_t, because compiler does not have native support for int8x4_t // hack for int8x4_t, because compiler does not have native support for int8x4_t
// int8x4_t is defined as int32_t // int8x4_t is defined as int32_t
using int8x2_t = typename vector_type<int8_t, 2>::type;
using int8x4_t = typename vector_type<int8_t, 4>::type; using int8x4_t = typename vector_type<int8_t, 4>::type;
using int8x8_t = typename vector_type<int8_t, 8>::type; using int8x8_t = typename vector_type<int8_t, 8>::type;
using int8x16_t = typename vector_type<int8_t, 16>::type; using int8x16_t = typename vector_type<int8_t, 16>::type;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment