Commit f3099df8 authored by Chao Liu

Trying to solve the scratch memory issue

parent 0413a020
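
The change applied most often in this commit is switching MultiIndex<N>{...} initializers from single to double braces. A minimal sketch of why, assuming MultiIndex<N> is the project's Array<index_t, N>, an aggregate wrapping a built-in array (the Array<index_t, 2>{{1, 0}} line in the new try.cpp below points that way); the explicit inner braces initialize the nested array without relying on brace elision:

// Minimal sketch, not the project's real Array.
#include <cstdio>

using index_t = int;

template <typename T, index_t N>
struct Array
{
    T mData[N]; // aggregate: a single nested array member
};

template <index_t N>
using MultiIndex = Array<index_t, N>;

int main()
{
    // Outer braces for the Array, inner braces for its nested member;
    // spelling both out avoids brace elision (and -Wmissing-braces noise).
    constexpr auto idx = MultiIndex<2>{{1, 0}};
    std::printf("%d %d\n", idx.mData[0], idx.mData[1]); // prints: 1 0
    return 0;
}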
......@@ -72,11 +72,12 @@ map_convolution_into_gemm(const WeiDesc& wei_k_c_y_x_global_desc,
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_gemmk_gemmn_global_desc =
transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return make_tuple(in_gemmk_gemmn_global_desc);
}
......@@ -146,7 +147,8 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -632,7 +634,7 @@ struct DummyDynamicTransform_1
for(index_t iter = 0; iter < niter; ++iter)
{
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{{1, 0}};
in_gemmk_gemmn_coord += gemmk1_gemmn0;
......@@ -793,8 +795,8 @@ struct DummyDynamicTransform_2
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
DynamicPassThrough{Hi},
DynamicPassThrough{Wi}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
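
This hunk swaps the two DynamicLeftPad transforms for plain DynamicPassThrough, presumably to test whether the pad index arithmetic contributes to the scratch memory use. A sketch of the index math that differs between them, inferred from the transform definitions later in this commit (up_length_ = low_length + left_pad suggests idx_low = idx_up - left_pad):

// Stand-in sketch; the real transforms are in this commit's transform structs.
#include <cstdio>

int main()
{
    const int InLeftPadH = 1; // pad amount, value hypothetical

    int idx_up   = 5;
    int low_pass = idx_up;              // DynamicPassThrough: identity
    int low_pad  = idx_up - InLeftPadH; // DynamicLeftPad: shift by the pad

    std::printf("%d %d\n", low_pass, low_pad); // prints: 5 4
    return 0;
}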
......
......@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
{0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
MultiIndex<4>{{0, 0, b_block_data_on_global, 0}}, MultiIndex<4>{{0, 0, 0, 0}});
// weight tensor
// global tensor in global memory, src of blockwise copy
......
......@@ -10,20 +10,20 @@ struct DynamicPassThrough
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t up_length_;
const UpperIndex up_lengths_;
__host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
: up_length_{low_length}
: up_lengths_{{low_length}}
{
}
__host__ __device__ explicit constexpr DynamicPassThrough() : up_length_{0} {}
__host__ __device__ explicit constexpr DynamicPassThrough() : up_lengths_{{0}} {}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
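
This is the pattern repeated for every transform below: the scalar up_length_ becomes a stored UpperIndex up_lengths_, and GetUpperLengths() returns a const reference instead of constructing a temporary on each call. A before/after sketch, under the assumption that those per-call temporaries are part of what was spilling to scratch:

// Sketch; Array/MultiIndex as in the aggregate sketch near the top.
template <typename T, int N>
struct Array { T mData[N]; };
template <int N>
using MultiIndex = Array<int, N>;

struct Before
{
    int up_length_;
    // materializes a fresh MultiIndex<1> on every call
    constexpr auto GetUpperLengths() const { return MultiIndex<1>{{up_length_}}; }
};

struct After
{
    MultiIndex<1> up_lengths_;
    // hands back the stored member; nothing is constructed per call
    constexpr const auto& GetUpperLengths() const { return up_lengths_; }
};

int main()
{
    constexpr After a{{{4}}};
    return a.GetUpperLengths().mData[0] - 4; // exits 0
}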
......@@ -68,22 +68,22 @@ struct DynamicLeftPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t up_length_;
const UpperIndex up_lengths_;
const index_t left_pad_;
__host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length,
const index_t& left_pad)
: up_length_{low_length + left_pad}, left_pad_{left_pad}
: up_lengths_{{low_length + left_pad}}, left_pad_{left_pad}
{
}
__host__ __device__ explicit constexpr DynamicLeftPad() : up_length_{0}, left_pad_{0} {}
__host__ __device__ explicit constexpr DynamicLeftPad() : up_lengths_{{0}}, left_pad_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
......@@ -96,10 +96,11 @@ struct DynamicLeftPad
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
......@@ -129,18 +130,18 @@ struct DynamicRightPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t up_length_;
const UpperIndex up_lengths_;
const index_t low_length_;
const index_t right_pad_;
__host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length,
const index_t& right_pad)
: up_length_{low_length + right_pad}, low_length_{low_length}, right_pad_{right_pad}
: up_lengths_{{low_length + right_pad}}, low_length_{low_length}, right_pad_{right_pad}
{
}
__host__ __device__ explicit constexpr DynamicRightPad()
: up_length_{0}, low_length_{0}, right_pad_{0}
: up_lengths_{{0}}, low_length_{0}, right_pad_{0}
{
}
......@@ -148,10 +149,11 @@ struct DynamicRightPad
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
__host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up)
{
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
......@@ -160,10 +162,11 @@ struct DynamicRightPad
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
__host__ __device__ static constexpr void
CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */)
{
static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
UpIdx::Size() == 1,
......@@ -216,7 +219,7 @@ struct DynamicEmbed
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
......@@ -276,13 +279,13 @@ struct DynamicMerge
const LowerIndex low_lengths_;
const LowerIndex low_lengths_scan_;
const index_t up_length_;
const UpperIndex up_lengths_;
__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
low_lengths_scan_{reverse_exclusive_scan_on_array(
low_lengths, math::multiplies<index_t>{}, index_t{1})},
up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)}
up_lengths_{{reduce_on_array(low_lengths, math::multiplies<index_t>(), index_t{1})}}
{
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
}
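
Worked numbers for the constructor above, with hypothetical lengths {C, Y, X} = {8, 3, 3}: low_lengths_scan_ is the reverse exclusive product scan {Y*X, X, 1} = {9, 3, 1}, and up_lengths_ is the product {72}. CalculateLowerIndex presumably walks that scan with div/mod; a self-contained sketch:

// Sketch of the merge bookkeeping; values hypothetical, decode inferred.
#include <cstdio>

int main()
{
    const int low_lengths[3] = {8, 3, 3}; // {C, Y, X}
    // reverse exclusive product scan, init 1 -> {3*3, 3, 1}
    const int scan[3] = {low_lengths[1] * low_lengths[2], low_lengths[2], 1};

    int u = 35; // a merged (upper) index in [0, 72)
    int idx_low[3];
    for(int d = 0; d < 3; ++d)
    {
        idx_low[d] = u / scan[d];
        u %= scan[d];
    }
    std::printf("%d %d %d\n", idx_low[0], idx_low[1], idx_low[2]); // prints: 3 2 2
    return 0;
}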
......@@ -290,7 +293,7 @@ struct DynamicMerge
__host__ __device__ explicit constexpr DynamicMerge()
: low_lengths_{make_zero_array<index_t, NDimLow>()},
low_lengths_scan_{make_zero_array<index_t, NDimLow>()},
up_length_{0}
up_lengths_{{0}}
{
}
......@@ -298,7 +301,7 @@ struct DynamicMerge
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
......@@ -444,7 +447,7 @@ struct DynamicUnMerge
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
__host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
......@@ -500,7 +503,7 @@ struct DynamicFreeze
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }
__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; }
__host__ __device__ static constexpr auto GetUpperLengths() { return UpperIndex{}; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
......
......@@ -150,7 +150,7 @@ struct DynamicTensorDescriptor_v2
index_t element_space_size)
{
// zero initialization
HiddenIndex hidden_lengths{0};
HiddenIndex hidden_lengths{{0}};
// this is the original tensor element space size
hidden_lengths(0) = element_space_size;
......@@ -319,7 +319,8 @@ struct lambda_get_up_dim_num
template <typename I>
__host__ __device__ constexpr auto operator()(I) const
{
return Number<NewTransforms{}.At(I{}).GetNumOfUpperDimension()>{};
using Tran = remove_reference_t<decltype(NewTransforms{}.At(I{}))>;
return Number<Tran::GetNumOfUpperDimension()>{};
}
};
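
With GetNumOfUpperDimension() now static on the transforms, the lambda can read the dimension count off the type alone; the old form value-constructed the whole NewTransforms tuple inside a template argument just to call a member on one element. A reduced sketch of the pattern:

// Reduced sketch; Number and the transform are stand-ins.
#include <type_traits>

template <int N>
struct Number { static constexpr int value = N; };

struct SomeTransform
{
    static constexpr int GetNumOfUpperDimension() { return 1; }
};

int main()
{
    // after: go through the (possibly reference-qualified) type, no object
    using Tran = std::remove_reference_t<SomeTransform&>;
    constexpr auto n = Number<Tran::GetNumOfUpperDimension()>{};
    static_assert(decltype(n)::value == 1, "one upper dimension");
    return 0;
}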
......@@ -488,7 +489,7 @@ __host__ __device__ void move_dynamic_tensor_coordinate_v2(const TensorDesc& ten
using HiddenIndex = MultiIndex<ndim_hidden>;
// this is what needs to be calculated
auto idx_diff_hidden = HiddenIndex{0};
auto idx_diff_hidden = HiddenIndex{{0}};
// initialize visible index diff
// idx_diff_hidden_pick_visible contains reference to idx_diff_hidden
......
......@@ -364,7 +364,7 @@ struct UnMerge
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low{0};
LowerIndex idx_low{{0}};
constexpr auto pseudo_up_strides =
reverse_inclusive_scan_sequence(
......@@ -425,7 +425,7 @@ struct Embed
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low = {Coefficients{}[nDimUp]};
LowerIndex idx_low{{Coefficients{}[nDimUp]}};
for(index_t i = 0; i < nDimUp; ++i)
{
......
......@@ -196,7 +196,7 @@ struct ClusterDescriptor
__host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
{
return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
return mDesc.CalculateLowerIndex(MultiIndex<1>{{idx_1d}});
}
};
......
......@@ -17,6 +17,7 @@ install(TARGETS host LIBRARY DESTINATION lib)
if(DEVICE_BACKEND STREQUAL "AMD")
set(CONV_SOURCE src/conv_driver.cpp)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
set(TRY_SOURCE src/try.cpp)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CONV_SOURCE src/conv_driver.cu)
set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
......@@ -24,6 +25,8 @@ endif()
add_executable(conv_driver ${CONV_SOURCE})
add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
add_executable(try ${TRY_SOURCE})
target_link_libraries(conv_driver PRIVATE host)
target_link_libraries(conv_bwd_data_driver PRIVATE host)
target_link_libraries(try PRIVATE host)
......@@ -12,17 +12,17 @@ template <class T,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
void device_dummy_dynamic_transform_1(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
......@@ -52,11 +52,11 @@ void device_dummy_dynamic_transform(InDesc,
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 0}});
for(index_t iter = 0; iter < 10; ++iter)
{
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
constexpr auto gemmk1_gemmn0 = MultiIndex<2>{{1, 0}};
printf("iter %d\n", iter);
......@@ -147,17 +147,17 @@ template <class T,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform_v2(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
void device_dummy_dynamic_transform_2(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
......@@ -187,10 +187,10 @@ void device_dummy_dynamic_transform_v2(InDesc,
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 0}});
const auto in_gemmk_gemmn_coord_step =
make_dynamic_tensor_coordinate_step_v2(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 1});
const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
in_gemmk_gemmn_global_desc, MultiIndex<2>{{0, 1}});
for(index_t iter = 0; iter < 20; ++iter)
{
......
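
The loop body here is collapsed, but the setup above shows the v2 flow: build a coordinate once, precompute a step against the descriptor, then apply the step each iteration instead of re-deriving the index math. A stand-in sketch of that pattern (the real calls are make_dynamic_tensor_coordinate_v2, make_dynamic_tensor_coordinate_step_v2, and move_dynamic_tensor_coordinate_v2; the types here are simplified):

// Stand-in sketch of the coordinate/step pattern; not the project's API.
#include <cstdio>

struct Coord { int idx[2]; };
struct Step  { int diff[2]; };

void move_coordinate(Coord& c, const Step& s) // analogue of move_..._v2
{
    c.idx[0] += s.diff[0];
    c.idx[1] += s.diff[1];
}

int main()
{
    Coord coord{{0, 0}};     // analogue of make_dynamic_tensor_coordinate_v2
    const Step step{{0, 1}}; // analogue of make_dynamic_tensor_coordinate_step_v2

    for(int iter = 0; iter < 20; ++iter)
        move_coordinate(coord, step);

    std::printf("%d %d\n", coord.idx[0], coord.idx[1]); // prints: 0 20
    return 0;
}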
......@@ -585,18 +585,18 @@ int main(int argc, char* argv[])
RightPads{},
nrepeat);
#elif 0
device_dummy_dynamic_transform(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dummy_dynamic_transform_1(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
device_dummy_dynamic_transform_2(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
int main(int argc, char* argv[])
{
using namespace ck;
auto idx1 = std::array<index_t, 2>{{1, 0}};
auto idx2 = Array<index_t, 2>{{1, 0}};
auto idx3 = MultiIndex<2>{{1, 0}};
auto idx0 = MultiIndex<2>{{1, 0}};
print_array("idx2", idx2);
print_array("idx3", idx2);
}
......@@ -8,14 +8,17 @@ MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_BUILD_TYPE=Debug \
-D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
#-D CMAKE_CXX_FLAGS="-c -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-gline-tables-only -S -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \