Commit f3099df8 authored by Chao Liu's avatar Chao Liu
Browse files

trying to solve scratch mem issue

parent 0413a020
...@@ -72,11 +72,12 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc, ...@@ -72,11 +72,12 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor( const auto in_gemmk_gemmn_global_desc =
in_n_c_y_ho_x_wo_global_desc, transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}), make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return make_tuple(in_gemmk_gemmn_global_desc); return make_tuple(in_gemmk_gemmn_global_desc);
} }
...@@ -146,7 +147,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc, ...@@ -146,7 +147,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2( const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
in_n_c_y_ho_x_wo_global_desc, in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}), make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
...@@ -632,7 +634,7 @@ struct DummyDynamicTransform_1 ...@@ -632,7 +634,7 @@ struct DummyDynamicTransform_1
for(index_t iter = 0; iter < niter; ++iter) for(index_t iter = 0; iter < niter; ++iter)
{ {
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0}; constexpr auto gemmk1_gemmn0 = MultiIndex<2>{{1, 0}};
in_gemmk_gemmn_coord += gemmk1_gemmn0; in_gemmk_gemmn_coord += gemmk1_gemmn0;
...@@ -793,8 +795,8 @@ struct DummyDynamicTransform_2 ...@@ -793,8 +795,8 @@ struct DummyDynamicTransform_2
in_n_c_hi_wi_global_desc, in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N}, make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C}, DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH}, DynamicPassThrough{Hi},
DynamicLeftPad{Wi, InLeftPadW}), DynamicPassThrough{Wi}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
......
...@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer ...@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
AddressSpace::Vgpr, AddressSpace::Vgpr,
AddressSpace::Lds, AddressSpace::Lds,
InMemoryDataOperation::Set>( InMemoryDataOperation::Set>(
{0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); MultiIndex<4>{{0, 0, b_block_data_on_global, 0}}, MultiIndex<4>{{0, 0, 0, 0}});
// weight tensor // weight tensor
// global tensor in global memory, src of blockwise copy // global tensor in global memory, src of blockwise copy
......
...@@ -10,20 +10,20 @@ struct DynamicPassThrough ...@@ -10,20 +10,20 @@ struct DynamicPassThrough
using LowerIndex = MultiIndex<1>; using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>;
const index_t up_length_; const UpperIndex up_lengths_;
__host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length) __host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
: up_length_{low_length} : up_lengths_{{low_length}}
{ {
} }
__host__ __device__ explicit constexpr DynamicPassThrough() : up_length_{0} {} __host__ __device__ explicit constexpr DynamicPassThrough() : up_lengths_{{0}} {}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; } __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; } __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; } __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) __host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
...@@ -68,22 +68,22 @@ struct DynamicLeftPad ...@@ -68,22 +68,22 @@ struct DynamicLeftPad
using LowerIndex = MultiIndex<1>; using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>;
const index_t up_length_; const UpperIndex up_lengths_;
const index_t left_pad_; const index_t left_pad_;
__host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length, __host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length,
const index_t& left_pad) const index_t& left_pad)
: up_length_{low_length + left_pad}, left_pad_{left_pad} : up_lengths_{{low_length + left_pad}}, left_pad_{left_pad}
{ {
} }
__host__ __device__ explicit constexpr DynamicLeftPad() : up_length_{0}, left_pad_{0} {} __host__ __device__ explicit constexpr DynamicLeftPad() : up_lengths_{{0}}, left_pad_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; } __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
...@@ -96,10 +96,11 @@ struct DynamicLeftPad ...@@ -96,10 +96,11 @@ struct DynamicLeftPad
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low, __host__ __device__ static constexpr void
const UpIdxDiff& idx_diff_up, CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const LowIdx& /* idx_low_old */, const UpIdxDiff& idx_diff_up,
const UpIdx& /* idx_up_old */) const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{ {
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1, UpIdx::Size() == 1,
...@@ -129,18 +130,18 @@ struct DynamicRightPad ...@@ -129,18 +130,18 @@ struct DynamicRightPad
using LowerIndex = MultiIndex<1>; using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>; using UpperIndex = MultiIndex<1>;
const index_t up_length_; const UpperIndex up_lengths_;
const index_t low_length_; const index_t low_length_;
const index_t right_pad_; const index_t right_pad_;
__host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length, __host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length,
const index_t& right_pad) const index_t& right_pad)
: up_length_{low_length + right_pad}, low_length_{low_length}, right_pad_{right_pad} : up_lengths_{{low_length + right_pad}}, low_length_{low_length}, right_pad_{right_pad}
{ {
} }
__host__ __device__ explicit constexpr DynamicRightPad() __host__ __device__ explicit constexpr DynamicRightPad()
: up_length_{0}, low_length_{0}, right_pad_{0} : up_lengths_{{0}}, low_length_{0}, right_pad_{0}
{ {
} }
...@@ -148,10 +149,11 @@ struct DynamicRightPad ...@@ -148,10 +149,11 @@ struct DynamicRightPad
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; } __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up)
{ {
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
...@@ -160,10 +162,11 @@ struct DynamicRightPad ...@@ -160,10 +162,11 @@ struct DynamicRightPad
} }
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx> template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low, __host__ __device__ static constexpr void
const UpIdxDiff& idx_diff_up, CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const LowIdx& /* idx_low_old */, const UpIdxDiff& idx_diff_up,
const UpIdx& /* idx_up_old */) const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{ {
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1, UpIdx::Size() == 1,
...@@ -216,7 +219,7 @@ struct DynamicEmbed ...@@ -216,7 +219,7 @@ struct DynamicEmbed
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; } __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; } __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
...@@ -276,13 +279,13 @@ struct DynamicMerge ...@@ -276,13 +279,13 @@ struct DynamicMerge
const LowerIndex low_lengths_; const LowerIndex low_lengths_;
const LowerIndex low_lengths_scan_; const LowerIndex low_lengths_scan_;
const index_t up_length_; const UpperIndex up_lengths_;
__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths) __host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths}, : low_lengths_{low_lengths},
low_lengths_scan_{reverse_exclusive_scan_on_array( low_lengths_scan_{reverse_exclusive_scan_on_array(
low_lengths, math::multiplies<index_t>{}, index_t{1})}, low_lengths, math::multiplies<index_t>{}, index_t{1})},
up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)} up_lengths_{{reduce_on_array(low_lengths, math::multiplies<index_t>(), index_t{1})}}
{ {
static_assert(LowerIndex::Size() == NDimLow, "wrong!"); static_assert(LowerIndex::Size() == NDimLow, "wrong!");
} }
...@@ -290,7 +293,7 @@ struct DynamicMerge ...@@ -290,7 +293,7 @@ struct DynamicMerge
__host__ __device__ explicit constexpr DynamicMerge() __host__ __device__ explicit constexpr DynamicMerge()
: low_lengths_{make_zero_array<index_t, NDimLow>()}, : low_lengths_{make_zero_array<index_t, NDimLow>()},
low_lengths_scan_{make_zero_array<index_t, NDimLow>()}, low_lengths_scan_{make_zero_array<index_t, NDimLow>()},
up_length_{0} up_lengths_{{0}}
{ {
} }
...@@ -298,7 +301,7 @@ struct DynamicMerge ...@@ -298,7 +301,7 @@ struct DynamicMerge
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; } __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
...@@ -444,7 +447,7 @@ struct DynamicUnMerge ...@@ -444,7 +447,7 @@ struct DynamicUnMerge
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; } __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; } __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
...@@ -500,7 +503,7 @@ struct DynamicFreeze ...@@ -500,7 +503,7 @@ struct DynamicFreeze
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; } __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; } __host__ __device__ static constexpr auto GetUpperLengths() { return UpperIndex{}; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
......
...@@ -150,7 +150,7 @@ struct DynamicTensorDescriptor_v2 ...@@ -150,7 +150,7 @@ struct DynamicTensorDescriptor_v2
index_t element_space_size) index_t element_space_size)
{ {
// zero initialization // zero initialization
HiddenIndex hidden_lengths{0}; HiddenIndex hidden_lengths{{0}};
// this is the orignal tensor element space size // this is the orignal tensor element space size
hidden_lengths(0) = element_space_size; hidden_lengths(0) = element_space_size;
...@@ -319,7 +319,8 @@ struct lambda_get_up_dim_num ...@@ -319,7 +319,8 @@ struct lambda_get_up_dim_num
template <typename I> template <typename I>
__host__ __device__ constexpr auto operator()(I) const __host__ __device__ constexpr auto operator()(I) const
{ {
return Number<NewTransforms{}.At(I{}).GetNumOfUpperDimension()>{}; using Tran = remove_reference_t<decltype(NewTransforms{}.At(I{}))>;
return Number<Tran::GetNumOfUpperDimension()>{};
} }
}; };
...@@ -488,7 +489,7 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten ...@@ -488,7 +489,7 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten
using HiddenIndex = MultiIndex<ndim_hidden>; using HiddenIndex = MultiIndex<ndim_hidden>;
// this is what needs to be calculated // this is what needs to be calculated
auto idx_diff_hidden = HiddenIndex{0}; auto idx_diff_hidden = HiddenIndex{{0}};
// initialize visible index diff // initialize visible index diff
// idx_diff_hidden_pick_visible contains reference to idx_diff_hidden // idx_diff_hidden_pick_visible contains reference to idx_diff_hidden
......
...@@ -364,7 +364,7 @@ struct UnMerge ...@@ -364,7 +364,7 @@ struct UnMerge
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{ {
LowerIndex idx_low{0}; LowerIndex idx_low{{0}};
constexpr auto pseudo_up_strides = constexpr auto pseudo_up_strides =
reverse_inclusive_scan_sequence( reverse_inclusive_scan_sequence(
...@@ -425,7 +425,7 @@ struct Embed ...@@ -425,7 +425,7 @@ struct Embed
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{ {
LowerIndex idx_low = {Coefficients{}[nDimUp]}; LowerIndex idx_low{{Coefficients{}[nDimUp]}};
for(index_t i = 0; i < nDimUp; ++i) for(index_t i = 0; i < nDimUp; ++i)
{ {
......
...@@ -196,7 +196,7 @@ struct ClusterDescriptor ...@@ -196,7 +196,7 @@ struct ClusterDescriptor
__host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d) __host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
{ {
return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d}); return mDesc.CalculateLowerIndex(MultiIndex<1>{{idx_1d}});
} }
}; };
......
...@@ -17,6 +17,7 @@ install(TARGETS host LIBRARY DESTINATION lib) ...@@ -17,6 +17,7 @@ install(TARGETS host LIBRARY DESTINATION lib)
if(DEVICE_BACKEND STREQUAL "AMD") if(DEVICE_BACKEND STREQUAL "AMD")
set(CONV_SOURCE src/conv_driver.cpp) set(CONV_SOURCE src/conv_driver.cpp)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp) set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
set(TRY_SOURCE src/try.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA") elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CONV_SOURCE src/conv_driver.cu) set(CONV_SOURCE src/conv_driver.cu)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu) set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
...@@ -24,6 +25,8 @@ endif() ...@@ -24,6 +25,8 @@ endif()
add_executable(conv_driver ${CONV_SOURCE}) add_executable(conv_driver ${CONV_SOURCE})
add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE}) add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
add_executable(try ${TRY_SOURCE})
target_link_libraries(conv_driver PRIVATE host) target_link_libraries(conv_driver PRIVATE host)
target_link_libraries(conv_bwd_data_driver PRIVATE host) target_link_libraries(conv_bwd_data_driver PRIVATE host)
target_link_libraries(try PRIVATE host)
...@@ -12,17 +12,17 @@ template <class T, ...@@ -12,17 +12,17 @@ template <class T,
class ConvDilations, class ConvDilations,
class InLeftPads, class InLeftPads,
class InRightPads> class InRightPads>
void device_dummy_dynamic_transform(InDesc, void device_dummy_dynamic_transform_1(InDesc,
const Tensor<T>& in_nchw, const Tensor<T>& in_nchw,
WeiDesc, WeiDesc,
const Tensor<T>& wei_kcyx, const Tensor<T>& wei_kcyx,
OutDesc, OutDesc,
Tensor<T>& out_nkhw, Tensor<T>& out_nkhw,
ConvStrides, ConvStrides,
ConvDilations, ConvDilations,
InLeftPads, InLeftPads,
InRightPads, InRightPads,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
...@@ -52,11 +52,11 @@ void device_dummy_dynamic_transform(InDesc, ...@@ -52,11 +52,11 @@ void device_dummy_dynamic_transform(InDesc,
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{}); const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord = auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0}); make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 0}});
for(index_t iter = 0; iter < 10; ++iter) for(index_t iter = 0; iter < 10; ++iter)
{ {
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0}; constexpr auto gemmk1_gemmn0 = MultiIndex<2>{{1, 0}};
printf("iter %d\n", iter); printf("iter %d\n", iter);
...@@ -147,17 +147,17 @@ template <class T, ...@@ -147,17 +147,17 @@ template <class T,
class ConvDilations, class ConvDilations,
class InLeftPads, class InLeftPads,
class InRightPads> class InRightPads>
void device_dummy_dynamic_transform_v2(InDesc, void device_dummy_dynamic_transform_2(InDesc,
const Tensor<T>& in_nchw, const Tensor<T>& in_nchw,
WeiDesc, WeiDesc,
const Tensor<T>& wei_kcyx, const Tensor<T>& wei_kcyx,
OutDesc, OutDesc,
Tensor<T>& out_nkhw, Tensor<T>& out_nkhw,
ConvStrides, ConvStrides,
ConvDilations, ConvDilations,
InLeftPads, InLeftPads,
InRightPads, InRightPads,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
...@@ -187,10 +187,10 @@ void device_dummy_dynamic_transform_v2(InDesc, ...@@ -187,10 +187,10 @@ void device_dummy_dynamic_transform_v2(InDesc,
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{}); const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord = auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0}); make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 0}});
const auto in_gemmk_gemmn_coord_step = const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 1}); in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 1}});
for(index_t iter = 0; iter < 20; ++iter) for(index_t iter = 0; iter < 20; ++iter)
{ {
......
...@@ -585,18 +585,18 @@ int main(int argc, char* argv[]) ...@@ -585,18 +585,18 @@ int main(int argc, char* argv[])
RightPads{}, RightPads{},
nrepeat); nrepeat);
#elif 0 #elif 0
device_dummy_dynamic_transform(in_nchw_desc, device_dummy_dynamic_transform_1(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
wei_kcyx, wei_kcyx,
out_nkhw_desc, out_nkhw_desc,
out_nkhw_device, out_nkhw_device,
ConvStrides{}, ConvStrides{},
ConvDilations{}, ConvDilations{},
LeftPads{}, LeftPads{},
RightPads{}, RightPads{},
nrepeat); nrepeat);
#elif 0 #elif 1
device_dummy_dynamic_transform_2(in_nchw_desc, device_dummy_dynamic_transform_2(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
......
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
// Smoke test for index-container brace initialization: std::array requires
// the double-brace aggregate form, and Array/MultiIndex are expected to
// accept the same syntax so the two can be used interchangeably.
int main(int argc, char* argv[])
{
    using namespace ck;

    // Unused; this driver takes no command-line arguments.
    (void)argc;
    (void)argv;

    // idx1/idx0 exist only to check that the double-brace form compiles
    // for std::array and MultiIndex; they are never printed.
    auto idx1 = std::array<index_t, 2>{{1, 0}};
    auto idx2 = Array<index_t, 2>{{1, 0}};
    auto idx3 = MultiIndex<2>{{1, 0}};
    auto idx0 = MultiIndex<2>{{1, 0}};
    (void)idx1;
    (void)idx0;

    print_array("idx2", idx2);
    // Fix: was `print_array("idx3", idx2)` — printed idx2 under the idx3
    // label, so the MultiIndex value was never actually shown.
    print_array("idx3", idx3);
}
...@@ -8,14 +8,17 @@ MY_PROJECT_INSTALL=../install.dir ...@@ -8,14 +8,17 @@ MY_PROJECT_INSTALL=../install.dir
cmake \ cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \ -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Debug \
-D DEVICE_BACKEND="AMD" \ -D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \ -D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \ -D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
#-D CMAKE_CXX_FLAGS="-c -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-gline-tables-only -S -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \ #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \ #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \ #-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment