"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "0595084452d6003ce7c74d537df863f07fba8e8c"
Commit 6c2c50b0 authored by Chao Liu

done: explicitly separate offset component into compile-time, block-invariant and per-thread components. Experimenting
parent 51884fc2
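
The commit message describes splitting the copy offset into a compile-time part, a block-invariant part, and a per-thread part, so that only the last component has to be recomputed per thread. A rough, hypothetical sketch of that idea (none of these identifiers appear in the commit):

```cpp
#include <cstdint>

// Hypothetical sketch of the offset decomposition named in the commit message;
// the names are illustrative, not taken from the repository.
struct DecomposedOffset
{
    // Known at compile time (e.g. from the tensor descriptor's constant strides),
    // so the compiler can fold it directly into the addressing arithmetic.
    static constexpr std::uint32_t compile_time = 0;

    // Computed once per workgroup (e.g. from the block index); identical for
    // every thread in the block, so it can be hoisted out of per-thread code.
    std::uint32_t block_invariant = 0;

    // The only part that differs between threads (e.g. from the thread index).
    std::uint32_t per_thread = 0;

    std::uint32_t Total() const { return compile_time + block_invariant + per_thread; }
};
```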
......@@ -438,7 +438,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
0,
b_thread_data_on_global,
0})
.template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
#if 0
.Run_generic
#elif 1
.template Run_generic<Float, address_space_t::generic, address_space_t::global>
#elif 1
.template Run_optimized_dst_address_calculation<Float, address_space_t::global>
#endif
(p_out_thread, p_out_global);
}
}
};
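
The write-back now goes through a chain of preprocessor toggles instead of the single hard-coded Run_amd_experiment call. Only the first branch whose condition is true is compiled; a minimal stand-alone demonstration of that selection behaviour (toy code, not from the repository):

```cpp
#include <cstdio>

int main()
{
    // Mirrors the "#if 0 / #elif 1 / #elif 1" pattern above: the preprocessor
    // keeps only the first branch with a true condition, so the middle branch
    // is built and the final one stays dead until the constants are edited.
#if 0
    std::puts("branch A: disabled");
#elif 1
    std::puts("branch B: the branch that actually gets compiled");
#elif 1
    std::puts("branch C: unreachable while branch B's condition is 1");
#endif
    return 0;
}
```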
......
......@@ -325,14 +325,14 @@ struct TensorCoordinate
private:
template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
{
return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
}
template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
{
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
}
......
......@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
private:
template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
......@@ -196,7 +196,7 @@ struct TensorCoordinate_v2
template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
......
......@@ -742,12 +742,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
{
#if 0
mThreadwiseLoad.Run(p_src, p_buffer);
mThreadwiseLoad.Run_generic(p_src, p_buffer);
#elif 1
mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
#elif 0
// hardcoded: global to register
mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
// hardcoded: src is global memory
mThreadwiseLoad.template Run_generic<TData, address_space_t::global>(p_src, p_buffer);
#elif 1
// hardcoded: src is global memory
mThreadwiseLoad
.template Run_optimized_src_address_calculation<TData, address_space_t::global>(
p_src, p_buffer);
#endif
}
......@@ -755,10 +758,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
#if 0
mThreadwiseStore.Run(p_buffer, p_dst);
mThreadwiseStore.Run_generic(p_buffer, p_dst);
#elif 1
// hardcoded: register to LDS
mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
// hardcoded: dst is lds
mThreadwiseStore.template Run_generic<TData, address_space_t::lds>(p_buffer, p_dst);
#elif 1
// hardcoded: dst is lds
mThreadwiseStore
.template Run_optimized_dst_address_calculation<TData, address_space_t::lds>(p_buffer,
p_dst);
#endif
}
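
RunLoadRegisterBuffer and RunStoreRegisterBuffer stage data through a per-thread register buffer, with the load hard-coded to a global-memory source and the store to an LDS destination. A host-side mock of that two-phase pattern (all names here are assumptions for illustration, not the kernel code):

```cpp
#include <cstring>

// Host-side stand-in for the staging pattern above: a "blockwise copy" is a
// load into a small per-thread buffer followed by a store out of it.
struct MockBlockwiseCopy
{
    static constexpr int buffer_size = 4;

    void RunLoadRegisterBuffer(const float* p_src, float* p_buffer) const
    {
        std::memcpy(p_buffer, p_src, buffer_size * sizeof(float)); // stands in for the global -> register load
    }

    void RunStoreRegisterBuffer(const float* p_buffer, float* p_dst) const
    {
        std::memcpy(p_dst, p_buffer, buffer_size * sizeof(float)); // stands in for the register -> LDS store
    }
};

int main()
{
    float src[4] = {1.f, 2.f, 3.f, 4.f};
    float buf[4] = {};
    float dst[4] = {};

    MockBlockwiseCopy copy;
    copy.RunLoadRegisterBuffer(src, buf);  // stage 1: source -> register buffer
    copy.RunStoreRegisterBuffer(buf, dst); // stage 2: register buffer -> destination
    return dst[3] == 4.f ? 0 : 1;
}
```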
......
......@@ -22,7 +22,7 @@
#include "amd_inline_asm.hpp"
#endif
#if CK_USE_AMD_INTRINCIS
#if CK_USE_AMD_INTRINSIC
#include "amd_intrinsic.hpp"
#endif
......
......@@ -8,7 +8,7 @@
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
......@@ -16,6 +16,14 @@
namespace ck {
enum address_space_t
{
generic = 0,
vgpr = 1,
lds = 2,
global = 3
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
......
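
The address_space_t enum added to the AMD config above gives symbolic names to the address spaces that the copy routines take as template parameters, replacing the bare integers of Run_amd_experiment. A hypothetical illustration (not code from this commit) of dispatching on such a parameter at compile time:

```cpp
// Mirrors the enum added above; the dispatch helper below is illustrative only.
enum address_space_t
{
    generic = 0,
    vgpr    = 1,
    lds     = 2,
    global  = 3
};

// With C++17, only the branch matching the template argument survives
// compilation; older standards would use tag dispatch or specialization
// to the same effect.
template <address_space_t SrcAddressSpace>
const char* DescribeSource()
{
    if constexpr (SrcAddressSpace == global)
        return "source is global memory (a candidate for buffer_load intrinsics)";
    else if constexpr (SrcAddressSpace == lds)
        return "source is LDS";
    else if constexpr (SrcAddressSpace == vgpr)
        return "source already lives in registers";
    else
        return "source uses generic addressing";
}

int main()
{
    return DescribeSource<global>() != nullptr ? 0 : 1;
}
```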
......@@ -10,7 +10,7 @@
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INTRINSIC 0
#define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
......@@ -18,6 +18,11 @@
namespace ck {
enum address_space_t
{
generic = 0
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
......
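
Note that the NVIDIA config above defines only the generic enumerator, while the AMD config also names vgpr, lds and global. A sketch (macro and namespace names assumed for illustration) of keeping references to the AMD-only enumerators behind a backend switch:

```cpp
// Illustrative only: mimic the two config headers with a single toggle.
#define MY_BACKEND_AMD 1 // set to 0 to mimic the NVIDIA config above

namespace ck_sketch {
enum address_space_t
{
    generic = 0
#if MY_BACKEND_AMD
    ,
    vgpr   = 1,
    lds    = 2,
    global = 3
#endif
};
} // namespace ck_sketch

int main()
{
#if MY_BACKEND_AMD
    constexpr auto dst_space = ck_sketch::lds;     // enumerator only present on the AMD side
#else
    constexpr auto dst_space = ck_sketch::generic; // the one enumerator both configs share
#endif
    return static_cast<int>(dst_space) >= 0 ? 0 : 1;
}
```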