Commit 6c2c50b0 authored by Chao Liu

done: explicitly separate the offset component into compile-time, block-invariant, and per-thread components. Experimenting
parent 51884fc2
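
The commit message describes splitting a tensor offset into a compile-time component, a block-invariant component, and a per-thread component. As a rough, hypothetical sketch of that decomposition (the type and member names below are illustrative only and do not appear in this diff):

#include <cstdint>

// Hypothetical illustration of the offset decomposition named in the commit
// message; not the composable_kernel implementation.
using index_t = std::uint32_t; // mirrors CK_UNSIGNED_INDEX_TYPE in this diff

template <index_t CompileTimeOffset> // fixed when the kernel is compiled
struct DecomposedOffset
{
    index_t block_invariant_offset; // identical for every thread in a block
    index_t per_thread_offset;      // differs from thread to thread

    constexpr index_t Get() const
    {
        // The effective offset is the sum of the three components, so each
        // component can be computed at the stage where it is cheapest.
        return CompileTimeOffset + block_invariant_offset + per_thread_offset;
    }
};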
@@ -438,7 +438,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                 0,
                 b_thread_data_on_global,
                 0})
-            .template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
+#if 0
+            .Run_generic
+#elif 1
+            .template Run_generic<Float, address_space_t::generic, address_space_t::global>
+#elif 1
+            .template Run_optimized_dst_address_calculation<Float, address_space_t::global>
+#endif
+            (p_out_thread, p_out_global);
         }
     }
 };
...
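
In the new call site above, the write to global memory is selected by an #if 0 / #elif 1 / #elif 1 ladder, so only the first branch with a non-zero condition is compiled: here that is Run_generic<Float, address_space_t::generic, address_space_t::global>, while the Run_optimized_dst_address_calculation branch stays dead until the flags are edited. A standalone example of that preprocessor behaviour:

#include <cstdio>

int main()
{
    // Only the first branch whose condition is non-zero survives preprocessing.
#if 0
    std::puts("first");  // disabled
#elif 1
    std::puts("second"); // this is the branch that gets compiled
#elif 1
    std::puts("third");  // never compiled: an earlier #elif already matched
#endif
    return 0;
}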
@@ -325,14 +325,14 @@ struct TensorCoordinate
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
     }
 
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
...
@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -196,7 +196,7 @@ struct TensorCoordinate_v2
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...
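
Both MakeDummyTensorCoordinate overloads above pick the coordinate type purely from the static type of an unnamed descriptor argument. A standalone illustration of that overload-based dispatch pattern (the type names here are placeholders, not the CK descriptors):

#include <type_traits>

// Placeholder descriptor and coordinate types standing in for the CK ones.
struct NativeDesc {};
struct TransformedDesc {};
struct NativeCoord {};
struct TransformedCoord {};

// The overload set maps each descriptor type to its coordinate type at
// compile time; the argument's value is never used, only its type.
constexpr auto make_dummy_coordinate(NativeDesc) { return NativeCoord{}; }
constexpr auto make_dummy_coordinate(TransformedDesc) { return TransformedCoord{}; }

static_assert(std::is_same_v<decltype(make_dummy_coordinate(NativeDesc{})), NativeCoord>);
static_assert(std::is_same_v<decltype(make_dummy_coordinate(TransformedDesc{})), TransformedCoord>);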
@@ -742,12 +742,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
 #if 0
-        mThreadwiseLoad.Run(p_src, p_buffer);
+        mThreadwiseLoad.Run_generic(p_src, p_buffer);
 #elif 1
-        mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
-#elif 0
-        // hardcoded: global to register
-        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
+        // hardcoded: src is global memory
+        mThreadwiseLoad.template Run_generic<TData, address_space_t::global>(p_src, p_buffer);
+#elif 1
+        // hardcoded: src is global memory
+        mThreadwiseLoad
+            .template Run_optimized_src_address_calculation<TData, address_space_t::global>(
+                p_src, p_buffer);
 #endif
     }
@@ -755,10 +758,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
 #if 0
-        mThreadwiseStore.Run(p_buffer, p_dst);
+        mThreadwiseStore.Run_generic(p_buffer, p_dst);
 #elif 1
-        // hardcoded: register to LDS
-        mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
+        // hardcoded: dst is lds
+        mThreadwiseStore.template Run_generic<TData, address_space_t::lds>(p_buffer, p_dst);
+#elif 1
+        // hardcoded: dst is lds
+        mThreadwiseStore
+            .template Run_optimized_dst_address_calculation<TData, address_space_t::lds>(p_buffer,
+                                                                                          p_dst);
 #endif
     }
...
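
The blockwise copy now tells the threadwise copy which address space it touches: address_space_t::global for the load from global memory and address_space_t::lds for the store into LDS. The sketch below is a hypothetical illustration of dispatching on such a non-type template parameter with if constexpr; it is not the actual threadwise copy in this commit.

#include <cstddef>

// Same enumerators as the AMD-side definition added later in this commit.
enum address_space_t
{
    generic = 0,
    vgpr = 1,
    lds = 2,
    global = 3
};

// Hypothetical store: the address space is a compile-time parameter, so a
// specialized path costs nothing when it is not selected.
template <typename TData, address_space_t DstAddressSpace>
void run_store_sketch(const TData* p_buffer, TData* p_dst, std::size_t n)
{
    if constexpr(DstAddressSpace == address_space_t::lds)
    {
        // A destination-specific fast path (e.g. LDS-friendly addressing)
        // would live here.
        for(std::size_t i = 0; i < n; ++i)
            p_dst[i] = p_buffer[i];
    }
    else
    {
        // Generic fallback: plain element-wise copy.
        for(std::size_t i = 0; i < n; ++i)
            p_dst[i] = p_buffer[i];
    }
}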
@@ -22,7 +22,7 @@
 #include "amd_inline_asm.hpp"
 #endif
 
-#if CK_USE_AMD_INTRINCIS
+#if CK_USE_AMD_INTRINSIC
 #include "amd_intrinsic.hpp"
 #endif
...
@@ -8,7 +8,7 @@
 #define CK_DEVICE_BACKEND_AMD 1
 #define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -16,6 +16,14 @@
 namespace ck {
 
+enum address_space_t
+{
+    generic = 0,
+    vgpr = 1,
+    lds = 2,
+    global = 3
+};
+
 #if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
...
@@ -10,7 +10,7 @@
 #define CK_DEVICE_BACKEND_NVIDIA 1
 #define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 0
-#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -18,6 +18,11 @@
 namespace ck {
 
+enum address_space_t
+{
+    generic = 0
+};
+
 #if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
...
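
Note that the AMD configuration defines all four enumerators while the NVIDIA configuration defines only generic, so any code compiled under the NVIDIA definition cannot name vgpr, lds, or global. One way to keep shared call sites valid on both backends is a defaulted address-space parameter; a hypothetical sketch, assuming only the NVIDIA-side enum:

// Hypothetical sketch built against the NVIDIA-side enum (generic only); a
// defaulted parameter lets callers omit the address space entirely.
enum address_space_t
{
    generic = 0
};

template <typename TData, address_space_t AddressSpace = address_space_t::generic>
void copy_sketch(const TData* p_src, TData* p_dst)
{
    // Backend-specific specializations could branch on AddressSpace; the
    // generic fallback is a plain assignment.
    *p_dst = *p_src;
}

int main()
{
    float a = 1.0f;
    float b = 0.0f;
    copy_sketch(&a, &b); // AddressSpace defaults to address_space_t::generic
    return 0;
}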