Commit 1d011fef authored by root's avatar root
Browse files

fix load/store

parent 3321471c
......@@ -216,8 +216,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, 4 * sizeof(float), 0);
tmp.Vectors(Number<4>{})(Number<1>{}) =
__llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(float),
0);
return tmp.Vector();
}
......@@ -265,8 +268,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) =
__llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(half_t),
0);
return tmp.Vector();
}
......@@ -295,8 +301,11 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
tmp.Vectors(Number<4>{})(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_i32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
tmp.Vectors(Number<4>{})(Number<1>{}) = __llvm_amdgcn_raw_buffer_load_i32x4(
src_wave_buffer_resource, src_thread_addr_offset, 4 * sizeof(int32_t), 0);
tmp.Vectors(Number<4>{})(Number<1>{}) =
__llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset + 4 * sizeof(int32_t),
0);
return tmp.Vector();
}
......@@ -457,19 +466,18 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
}
else if constexpr(N == 8)
{
vector_type<half_t, 8> tmp;
tmp.Vector() = src_thread_data;
vector_type<half_t, 8> tmp{src_thread_data};
__llvm_amdgcn_raw_buffer_store_fp16x4(tmp.Vectors(Number<4>{})[Number<0>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
0);
__llvm_amdgcn_raw_buffer_store_fp16x4(tmp.Vectors(Number<4>{})[Number<1>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
dst_wave_addr_offset + 4 * sizeof(half_t),
0);
}
}
......
......@@ -11,9 +11,9 @@
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
#define CK_AMD_GPU_GFX906 0
#define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX908 0
#define CK_AMD_GPU_GFX1030 1
#define CK_AMD_GPU_GFX1030 0
// HIP version
#ifndef CK_HIP_VERSION_FLAT
......
......@@ -10,7 +10,7 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx1030 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -gline-tables-only -save-temps=$CWD -ftemplate-backtrace-limit=0" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment