"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "aa0199a31ca262f1a62746dc08e54ee6dc71fd5c"
Commit 7a251a09 authored by Chao Liu's avatar Chao Liu
Browse files

debugged: CUDA should use its own float4 definition

parent f6cb5b84
...@@ -193,6 +193,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -193,6 +193,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
#elif 0 #elif 0
// 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer // 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer
// 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer
constexpr index_t BPerBlock = 64; constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128; constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8; constexpr index_t CPerBlock = 8;
...@@ -266,7 +267,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -266,7 +267,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
for(index_t i = 0; i < nrepeat; ++i) for(index_t i = 0; i < nrepeat; ++i)
{ {
constexpr auto gridwise_conv = constexpr auto gridwise_conv =
#if 1 #if 0
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
#else #else
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
......
...@@ -580,7 +580,7 @@ int main(int argc, char* argv[]) ...@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0; constexpr index_t HPad = 0;
constexpr index_t WPad = 0; constexpr index_t WPad = 0;
#elif 0 #elif 1
// 1x1 filter, 14x14 image, C = 2048 // 1x1 filter, 14x14 image, C = 2048
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 2048; constexpr index_t C = 2048;
......
...@@ -15,9 +15,13 @@ struct vector_type<float, 1> ...@@ -15,9 +15,13 @@ struct vector_type<float, 1>
template <> template <>
struct vector_type<float, 2> struct vector_type<float, 2>
{ {
#if 1 #if DEVICE_BACKEND_HIP
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
typedef float MemoryType __attribute__((ext_vector_type(2))); typedef float MemoryType __attribute__((ext_vector_type(2)));
#else #elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
using MemoryType = float2; using MemoryType = float2;
#endif #endif
...@@ -38,9 +42,13 @@ struct vector_type<float, 2> ...@@ -38,9 +42,13 @@ struct vector_type<float, 2>
template <> template <>
struct vector_type<float, 4> struct vector_type<float, 4>
{ {
#if 1 #if DEVICE_BACKEND_HIP
// For some reason, HIP compiler need this definition to generate optimal load and store instruction
typedef float MemoryType __attribute__((ext_vector_type(4))); typedef float MemoryType __attribute__((ext_vector_type(4)));
#else #elif DEVICE_BACKEND_CUDA
// For some reason, CUDA need this definition to, otherwise
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct instruction,
using MemoryType = float4; using MemoryType = float4;
#endif #endif
}; };
......
...@@ -204,8 +204,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -204,8 +204,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// preload data into LDS // preload data into LDS
{ {
#if 1 #if 1
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double); Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double); Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
p_in_register_clipboard);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
p_wei_register_clipboard);
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double);
#elif 0 #elif 0
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
...@@ -363,9 +373,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -363,9 +373,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
#elif 1 #elif 1
blockwise_gemm.Run_asm blockwise_gemm.Run_asm
#endif #endif
(p_wei_block_double + in_block_space + (p_wei_block_double + wei_block_space +
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_double + wei_block_space + y * Wi + x, p_in_block_double + in_block_space + y * Wi + x,
p_out_thread); p_out_thread);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment