Commit a46a17fb authored by root

buffer load workaround

parent 415b4cbd
@@ -141,17 +141,17 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
         constexpr auto a_k_m_global_move_slice_window_iterator_hack = Sequence<0, 0, 0>{};
 
         constexpr auto b_k_n_global_iterator_hacks =
-            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
-                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
+                       make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
                                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
 
         constexpr auto b_k_n_global_move_slice_window_iterator_hack =
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{};
 
         // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
         // hack for NKHW format
...
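The nonzero entries introduced here (the 1 and 2 in the tenth slot of the B-side iterator hacks, and the 1 in the move-slice-window hack) feed the mechanism the trailing comments in this hunk describe: compile-time hints that control how indices are recalculated while iterating over the global tensors, so the address arithmetic behind each buffer load can be simplified. As a standalone illustration of that general flavor of per-dimension compile-time flag (my own sketch, not the library's implementation):

    // Standalone sketch (my own illustration, not the library's implementation):
    // a compile-time Sequence of per-dimension flags used to skip index updates
    // for dimensions that are known not to move.
    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <utility>

    template <int... Is>
    struct Sequence
    {
        static constexpr int At(std::size_t i)
        {
            constexpr int data[sizeof...(Is)] = {Is...};
            return data[i];
        }
    };

    template <int... Flags, std::size_t... Ds>
    void move_index(std::array<int, sizeof...(Ds)>& idx,
                    const std::array<int, sizeof...(Ds)>& step,
                    Sequence<Flags...>,
                    std::index_sequence<Ds...>)
    {
        // Each dimension's branch is resolved at compile time, so dimensions
        // flagged 0 contribute no address arithmetic at all.
        (
            [&] {
                if constexpr(Sequence<Flags...>::At(Ds) != 0)
                    idx[Ds] += step[Ds];
            }(),
            ...);
    }

    int main()
    {
        std::array<int, 3> idx{0, 0, 0};
        std::array<int, 3> step{5, 7, 9};
        move_index(idx, step, Sequence<0, 1, 0>{}, std::make_index_sequence<3>{});
        std::printf("%d %d %d\n", idx[0], idx[1], idx[2]); // prints: 0 7 0
    }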
@@ -262,12 +262,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);
 
-#if 1
+#if 0
         __syncthreads();
 
-        index_t sum = 0;
-        for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-            sum += p_b_thread_double[i];
+        //index_t sum = 0;
+        //for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
+            //sum += p_b_thread_double[i];
 
         p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
         p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
@@ -275,7 +275,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
 #endif
     }
 
-#if 0
+#if 1
     if constexpr(HasMainKBlockLoop)
     {
         Float* p_a_block_even = p_a_block_double;
...
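The first hunk disables a debug block (#if 1 to #if 0) that folded elements of the freshly loaded B thread buffer into p_c_thread[0], which keeps the buffer loads observable (and therefore actually issued) while the real GEMM loop is compiled out; the second hunk re-enables that main K-block loop (#if 0 to #if 1). The pattern in isolation, as a plain host-side sketch of the assumed intent, not the project's code:

    // Sum the freshly loaded thread buffer into the real accumulator so the
    // loads stay live while the main compute loop is #if'd out.
    #include <cstddef>

    void keep_loads_alive(const float* p_thread_buf, std::size_t n, float* p_acc)
    {
        float sum = 0.0f;
        for(std::size_t i = 0; i < n; ++i)
            sum += p_thread_buf[i]; // every loaded element feeds the output

        *p_acc += sum; // writing to the accumulator makes the loads non-dead
    }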
@@ -535,8 +535,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
                     dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
                                              i * src_scalar_step_in_vector);
 
-                // p_dst[Number<dst_offset>{}] = src_vector[i];
-                p_dst[Number<dst_offset>{}] = src_vector.Scalars()(i);
+                p_dst[Number<dst_offset>{}] = src_vector.Scalars()[i];
             });
 
         constexpr auto move_on_dim = [&]() constexpr
...
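The fix above drops the call-style access src_vector.Scalars()(i) in favor of subscripting the scalar view with operator[], and it re-enables the store into p_dst that had been commented out. A minimal stand-in for a vector wrapper with such a Scalars() view (hypothetical shape, assuming nothing about the library's real vector type beyond what the diff shows):

    // Hypothetical stand-in for a small vector register type with a Scalars()
    // view; only meant to show the operator[] access pattern used above.
    #include <array>
    #include <cstdio>

    template <typename T, int N>
    struct SmallVector
    {
        std::array<T, N> data{};

        // Scalar (element-wise) view of the vector: indexed with [].
        std::array<T, N>& Scalars() { return data; }
        const std::array<T, N>& Scalars() const { return data; }
    };

    int main()
    {
        SmallVector<float, 4> v;
        v.Scalars()[0] = 1.0f; // element access goes through []
        float dst[4] = {};
        for(int i = 0; i < 4; ++i)
            dst[i] = v.Scalars()[i]; // mirrors: p_dst[...] = src_vector.Scalars()[i]
        std::printf("%f\n", dst[0]);
    }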
@@ -7,7 +7,7 @@
 #endif
 #include "bfloat16_dev.hpp"
 
-#if 0
+#if 1
 #define CK_AMD_GPU_GFX906 1
 #elif 0
 #define CK_AMD_GPU_GFX908 1
@@ -74,7 +74,7 @@
 // experimental implementation
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
-#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 #endif
 
 #ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
...
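Besides pinning the compiled GPU target to gfx906, this hunk turns on CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK, which appears to be the heart of the "buffer load workaround": instead of guarding each source element with a branch or select, the out-of-bounds check is folded into the buffer-load offset, relying on AMD range-checked buffer loads returning zero for offsets outside the buffer resource's declared size. A rough host-side model of that idea (assumed behaviour, not the real intrinsics or the library's implementation):

    // Host-side model: an AMD-style range-checked buffer load reads as zero
    // outside the declared buffer size, so validity can be encoded in the
    // offset itself.
    #include <cstdint>
    #include <cstdio>

    float buffer_load_model(const float* p_buf, std::uint32_t buf_size, std::uint32_t offset)
    {
        // Hardware range check: out-of-range loads return zero.
        return offset < buf_size ? p_buf[offset] : 0.0f;
    }

    float load_with_oob_offset_trick(const float* p_buf,
                                     std::uint32_t buf_size,
                                     std::uint32_t offset,
                                     bool is_valid)
    {
        // Fold the validity flag into the offset: invalid elements are pushed
        // past the end of the buffer and come back as 0 from the range check,
        // so no per-element branch is needed around the load itself.
        const std::uint32_t checked_offset = is_valid ? offset : buf_size;
        return buffer_load_model(p_buf, buf_size, checked_offset);
    }

    int main()
    {
        float buf[4] = {1.f, 2.f, 3.f, 4.f};
        std::printf("%f %f\n",
                    load_with_oob_offset_trick(buf, 4, 2, true),   // 3.0
                    load_with_oob_offset_trick(buf, 4, 2, false)); // 0.0 (padding)
    }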
@@ -78,7 +78,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t KPerThread = 16;
     constexpr index_t HPerThread = 1;
     constexpr index_t WPerThread = 1;
-    constexpr index_t CYXPerThread = 3 * 3;
+    constexpr index_t CYXPerThread = 4 * 3 * 3;
 
     using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;
...
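One plausible reading of the CYXPerThread bump (my inference; the diff itself gives no rationale): each thread's reduction slice now spans 4 input channels of the 3x3 filter window instead of 1, and the resulting 36 matches the A-side block transfer's per-block GemmK coverage (slice length 9 times cluster length 4). The arithmetic, as compile-time checks:

    // My own cross-check of the constants in this hunk, not project code.
    constexpr int CYXPerThread       = 4 * 3 * 3; // 4 channels * 3x3 filter taps
    constexpr int GemmKSliceLength   = 9;         // ThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>
    constexpr int GemmKClusterLength = 4;         // ThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>

    static_assert(CYXPerThread == 36, "previously 3 * 3 = 9");
    static_assert(GemmKSliceLength * GemmKClusterLength == CYXPerThread,
                  "9 * 4 = 36: per-block GemmK coverage equals CYXPerThread");

    int main() { return 0; }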
@@ -657,7 +657,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
@@ -776,7 +776,7 @@ int main(int argc, char* argv[])
         }
 
         check_error(out_nkhw_host, out_nkhw_device);
 
-#if 1
+#if 0
         // LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
         // LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
         LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
...
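The verification hunks flip two debug conveniences off: the inputs are no longer filled with the all-ones GeneratorTensor_1 pattern (verification falls through to whichever generator branch comes next), and the dump of out_nkhw_host / out_nkhw_device is disabled. For context, a functor in the spirit of GeneratorTensor_1 can be as small as the following (hypothetical shape, not the project's definition); all-ones inputs are handy because every convolution output becomes a simple count of contributing terms:

    // Hypothetical, minimal stand-in for an "all ones" tensor generator functor.
    struct GeneratorTensorOnes
    {
        template <typename... Is>
        double operator()(Is...) const // ignores the multi-dimensional index
        {
            return 1.0;
        }
    };

    int main()
    {
        GeneratorTensorOnes gen;
        double v = gen(0, 1, 2, 3); // always 1.0, whatever the index
        return v == 1.0 ? 0 : 1;
    }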
@@ -3,7 +3,7 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
 
-MY_PROJECT_SOURCE=../../../
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir
 
 cmake \
...