Commit 6c2c50b0 authored by Chao Liu

done: explicitly separate offset component into compile-time, block-invariant and per-thread components. Experimenting
parent 51884fc2
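A minimal sketch of the idea named in the commit message, under stated assumptions: a thread's tensor offset is the sum of a component fixed at compile time (from a static tensor descriptor), a component that is identical for every thread in a block (derived from blockIdx, so on AMD GPUs it can live in scalar registers), and a per-thread component (derived from threadIdx, which must live in vector registers). The names below are hypothetical, not the repository's.

```cpp
#include <cstdint>

using index_t = uint32_t;

// Hypothetical illustration: keeping the three offset components separate
// lets the compiler fold the first into immediates, keep the second in
// block-uniform (scalar) registers, and spend vector registers only on the
// third.
template <index_t CompileTimeOffset>
struct decomposed_offset
{
    index_t block_invariant; // computed once per block from blockIdx
    index_t per_thread;      // computed per thread from threadIdx

    __device__ index_t total() const
    {
        // CompileTimeOffset is a template argument, so adding it costs
        // nothing at run time.
        return CompileTimeOffset + block_invariant + per_thread;
    }
};
```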
@@ -438,7 +438,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                  0,
                  b_thread_data_on_global,
                  0})
-            .template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
+#if 0
+            .Run_generic
+#elif 1
+            .template Run_generic<Float, address_space_t::generic, address_space_t::global>
+#elif 1
+            .template Run_optimized_dst_address_calculation<Float, address_space_t::global>
+#endif
+            (p_out_thread, p_out_global);
         }
     }
 };
......
@@ -742,12 +742,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
 #if 0
-        mThreadwiseLoad.Run(p_src, p_buffer);
+        mThreadwiseLoad.Run_generic(p_src, p_buffer);
 #elif 1
-        mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
-#elif 0
-        // hardcoded: global to register
-        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
+        // hardcoded: src is global memory
+        mThreadwiseLoad.template Run_generic<TData, address_space_t::global>(p_src, p_buffer);
+#elif 1
+        // hardcoded: src is global memory
+        mThreadwiseLoad
+            .template Run_optimized_src_address_calculation<TData, address_space_t::global>(
+                p_src, p_buffer);
 #endif
     }
@@ -755,10 +758,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
 #if 0
-        mThreadwiseStore.Run(p_buffer, p_dst);
+        mThreadwiseStore.Run_generic(p_buffer, p_dst);
 #elif 1
-        // hardcoded: register to LDS
-        mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
+        // hardcoded: dst is lds
+        mThreadwiseStore.template Run_generic<TData, address_space_t::lds>(p_buffer, p_dst);
+#elif 1
+        // hardcoded: dst is lds
+        mThreadwiseStore
+            .template Run_optimized_dst_address_calculation<TData, address_space_t::lds>(p_buffer,
+                                                                                         p_dst);
 #endif
     }
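The pattern in these two hunks passes the address space of each pointer as a compile-time template argument instead of discovering it at run time. A minimal sketch of how such a tag can drive specialization, using hypothetical names (the real Run_generic signature is only what the diff shows):

```cpp
enum address_space_t
{
    generic = 0,
    vgpr    = 1,
    lds     = 2,
    global  = 3
};

// Primary template: a plain per-element copy that is valid in any space.
template <typename T, address_space_t SrcSpace, address_space_t DstSpace>
struct element_copy
{
    __device__ static void run(const T* p_src, T* p_dst) { *p_dst = *p_src; }
};

// Specialization hook: when the source is statically known to be global
// memory, a target-specific load (e.g. an AMD buffer load) could be
// substituted here without touching any call site. The body below is the
// portable placeholder, not a real intrinsic.
template <typename T, address_space_t DstSpace>
struct element_copy<T, address_space_t::global, DstSpace>
{
    __device__ static void run(const T* p_src, T* p_dst) { *p_dst = *p_src; }
};
```

A call site then states what it already knows at compile time, as in the first hunk: `element_copy<Float, address_space_t::generic, address_space_t::global>::run(p_out_thread, p_out_global)`.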
......
@@ -22,7 +22,7 @@
 #include "amd_inline_asm.hpp"
 #endif
-#if CK_USE_AMD_INTRINCIS
+#if CK_USE_AMD_INTRINSIC
 #include "amd_intrinsic.hpp"
 #endif
......
@@ -8,7 +8,7 @@
 #define CK_DEVICE_BACKEND_AMD 1
 #define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -16,6 +16,14 @@
 namespace ck {
+enum address_space_t
+{
+    generic = 0,
+    vgpr = 1,
+    lds = 2,
+    global = 3
+};
+
 #if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
......
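One plausible reading of why `global` gets a distinct tag on the AMD backend, suggested by the renamed CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE flag (an assumption, not stated in the commit): global-memory accesses can be lowered to hardware buffer load/store instructions, while the other spaces cannot. A hedged sketch of that gating, with the intrinsic call elided rather than invented; `index_t` is assumed in scope as in the config headers above:

```cpp
// Hypothetical helper: global loads get a dedicated dispatch point.
template <typename T>
__device__ T load_global(const T* p, index_t offset)
{
#if CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
    // On the AMD backend a buffer-load intrinsic could be called here; its
    // exact name and signature are target-specific, so this sketch falls
    // through to the portable path instead.
#endif
    return p[offset]; // portable path; the only one the NVIDIA config enables
}
```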
@@ -10,7 +10,7 @@
 #define CK_DEVICE_BACKEND_NVIDIA 1
 #define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 0
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -18,6 +18,11 @@
 namespace ck {
+enum address_space_t
+{
+    generic = 0
+};
+
 #if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
......