Commit 1d011fef authored by root's avatar root
Browse files

fix load/store

parent 3321471c
...@@ -216,8 +216,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, ...@@ -216,8 +216,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4( tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4( tmp.Vectors(Number<4>{})(Number<1>{}) =
src_wave_buffer_resource, src_thread_addr_offset, 4 * sizeof(float), 0); __llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float),
0);
return tmp.Vector(); return tmp.Vector();
} }
...@@ -265,8 +268,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, ...@@ -265,8 +268,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4( tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4( tmp.Vectors(Number<4>{})(Number<1>{}) =
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); __llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(half_t),
0);
return tmp.Vector(); return tmp.Vector();
} }
...@@ -295,8 +301,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, ...@@ -295,8 +301,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_i32x4( tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_i32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) = __llvm_amdgcn_raw_buffer_load_i32x4( tmp.Vectors(Number<4>{})(Number<1>{}) =
src_wave_buffer_resource, src_thread_addr_offset, 4 * sizeof(int32_t), 0); __llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t),
0);
return tmp.Vector(); return tmp.Vector();
} }
...@@ -457,19 +466,18 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type ...@@ -457,19 +466,18 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
vector_type<half_t, 8> tmp; vector_type<half_t, 8> tmp{src_thread_data};
tmp.Vector() = src_thread_data;
__llvm_amdgcn_raw_buffer_store_fp16x4(tmp.Vectors(Number<4>{})[Number<0>{}], __llvm_amdgcn_raw_buffer_store_fp16x4(tmp.Vectors(Number<4>{})[Number<0>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
0); 0);
__llvm_amdgcn_raw_buffer_store_fp16x4(tmp.Vectors(Number<4>{})[Number<1>{}], __llvm_amdgcn_raw_buffer_store_fp16x4(tmp.Vectors(Number<4>{})[Number<1>{}],
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset + 4 * sizeof(half_t),
0); 0);
} }
} }
......
...@@ -11,9 +11,9 @@ ...@@ -11,9 +11,9 @@
#define CK_DEVICE_BACKEND_AMD 1 #define CK_DEVICE_BACKEND_AMD 1
// GPU ID // GPU ID
#define CK_AMD_GPU_GFX906 0 #define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX908 0 #define CK_AMD_GPU_GFX908 0
#define CK_AMD_GPU_GFX1030 1 #define CK_AMD_GPU_GFX1030 0
// HIP version // HIP version
#ifndef CK_HIP_VERSION_FLAT #ifndef CK_HIP_VERSION_FLAT
......
...@@ -10,7 +10,7 @@ cmake ...@@ -10,7 +10,7 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="AMD" \ -D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx1030 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \ -D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \ -D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment