Commit c98cbea0 authored by Chao Liu's avatar Chao Liu
Browse files

refactoring array type

parent 674c405f
......@@ -68,17 +68,18 @@ map_convolution_into_gemm_v2(const WeiDesc& wei_k_c_y_x_global_desc,
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor_v2(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor_v2(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......
......@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
MultiIndex<4>{{0, 0, b_block_data_on_global, 0}}, MultiIndex<4>{{0, 0, 0, 0}});
MultiIndex<4>{0, 0, b_block_data_on_global, 0}, MultiIndex<4>{0, 0, 0, 0});
// weight tensor
// global tensor in global memory, src of blockwise copy
......
......@@ -2,6 +2,7 @@
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
#include "multi_index.hpp"
namespace ck {
......@@ -13,11 +14,11 @@ struct DynamicPassThrough
const UpperIndex up_lengths_;
__host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
: up_lengths_{{low_length}}
: up_lengths_{low_length}
{
}
__host__ __device__ explicit constexpr DynamicPassThrough() : up_lengths_{{0}} {}
__host__ __device__ explicit constexpr DynamicPassThrough() : up_lengths_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
......@@ -73,11 +74,11 @@ struct DynamicLeftPad
__host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length,
const index_t& left_pad)
: up_lengths_{{low_length + left_pad}}, left_pad_{left_pad}
: up_lengths_{low_length + left_pad}, left_pad_{left_pad}
{
}
__host__ __device__ explicit constexpr DynamicLeftPad() : up_lengths_{{0}}, left_pad_{0} {}
__host__ __device__ explicit constexpr DynamicLeftPad() : up_lengths_{0}, left_pad_{0} {}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
......@@ -136,12 +137,12 @@ struct DynamicRightPad
__host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length,
const index_t& right_pad)
: up_lengths_{{low_length + right_pad}}, low_length_{low_length}, right_pad_{right_pad}
: up_lengths_{low_length + right_pad}, low_length_{low_length}, right_pad_{right_pad}
{
}
__host__ __device__ explicit constexpr DynamicRightPad()
: up_lengths_{{0}}, low_length_{0}, right_pad_{0}
: up_lengths_{0}, low_length_{0}, right_pad_{0}
{
}
......@@ -190,8 +191,7 @@ struct DynamicRightPad
}
};
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +
// coefficients[nDimUp]
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1]
template <index_t NDimUp>
struct DynamicEmbed
{
......@@ -199,11 +199,10 @@ struct DynamicEmbed
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const Array<index_t, NDimUp + 1> coefficients_;
const UpperIndex coefficients_;
__host__
__device__ explicit constexpr DynamicEmbed(const UpperIndex& up_lengths,
const Array<index_t, NDimUp + 1>& coefficients)
__host__ __device__ explicit constexpr DynamicEmbed(const UpperIndex& up_lengths,
const UpperIndex& coefficients)
: up_lengths_{up_lengths}, coefficients_{coefficients}
{
static_assert(UpperIndex::Size() == NDimUp, "wrong! # of dimensions not consistent");
......@@ -211,7 +210,7 @@ struct DynamicEmbed
__host__ __device__ explicit constexpr DynamicEmbed()
: up_lengths_{make_zero_array<index_t, NDimUp>()},
coefficients_{make_zero_array<index_t, NDimUp + 1>()}
coefficients_{make_zero_array<index_t, NDimUp>()}
{
}
......@@ -228,7 +227,7 @@ struct DynamicEmbed
static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
"wrong! inconsistent # of dimension");
idx_low(Number<0>{}) = coefficients_[Number<NDimUp>{}];
idx_low(Number<0>{}) = 0;
static_for<0, NDimUp, 1>{}([&idx_low, &idx_up, this](auto i) {
idx_low(Number<0>{}) += idx_up[i] * this->coefficients_[i];
......@@ -288,7 +287,7 @@ struct DynamicMerge
__host__ __device__ explicit constexpr DynamicMerge()
: low_lengths_{make_zero_array<index_t, NDimLow>()},
low_lengths_scan_{make_zero_array<index_t, NDimLow>()},
up_lengths_{{0}}
up_lengths_{0}
{
}
......
......@@ -31,9 +31,7 @@ template <index_t N>
__host__ __device__ constexpr auto
make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const MultiIndex<N>& strides)
{
const auto coefficients = strides.PushBack(index_t{0});
const auto transforms = make_tuple(DynamicEmbed<N>{lengths, coefficients});
const auto transforms = make_tuple(DynamicEmbed<N>{lengths, strides});
constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{});
constexpr auto up_dim_hidden_idss =
make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{});
......@@ -41,11 +39,7 @@ make_dynamic_native_tensor_descriptor_v2(const MultiIndex<N>& lengths, const Mul
index_t element_space_size = 1;
#pragma unroll
for(index_t i = 0; i < N; ++i)
{
element_space_size += (lengths[i] - 1) * strides[i];
}
static_for<0, N, 1>{}([&](auto i) { element_space_size += (lengths[i] - 1) * strides[i]; });
return DynamicTensorDescriptor_v2<decltype(transforms),
decltype(low_dim_hidden_idss),
......
#ifndef CK_MULTI_INDEX_HPP
#define CK_MULTI_INDEX_HPP

#include "common_header.hpp"

namespace ck {

#if 1 // debug: runtime Array-backed representation
// MultiIndex is an N-dimensional index vector stored as a runtime Array.
template <index_t N>
using MultiIndex = Array<index_t, N>;

// Builds a MultiIndex from a parameter pack, casting every argument to index_t.
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    // Array wraps a built-in array member, hence the double-brace aggregate init.
    return MultiIndex<sizeof...(Xs)>{{static_cast<index_t>(xs)...}};
}
#else
// Alternate representation: compile-time-indexed (Tuple-backed) storage.
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;

template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    // StaticallyIndexedArray has a variadic constructor, so parentheses are
    // used here instead of aggregate braces. (A stray trailing token after the
    // pack expansion was removed — it made this branch fail to compile.)
    return MultiIndex<sizeof...(Xs)>(static_cast<index_t>(xs)...);
}
#endif

// Returns a MultiIndex<NSize> with every component equal to 0.
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
{
    // Expand a compile-time sequence of NSize zeros into make_multi_index.
    return unpack([](auto... xs) { return make_multi_index(xs...); },
                  typename uniform_sequence_gen<NSize, 0>::type{});
}

} // namespace ck
#endif
......@@ -2,24 +2,10 @@
#define CK_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
#include "multi_index.hpp"
namespace ck {
template <index_t N>
using MultiIndex = Array<index_t, N>;
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
return MultiIndex<sizeof...(Xs)>(xs...);
}
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
{
return make_zero_array<index_t, NSize>();
}
template <index_t Length>
struct PassThrough
{
......
......@@ -196,7 +196,7 @@ struct ClusterDescriptor
__host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
{
return mDesc.CalculateLowerIndex(MultiIndex<1>{{idx_1d}});
return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
}
};
......
......@@ -16,48 +16,96 @@ template <typename TData>
struct StaticallyIndexedArray<TData, 0> : Tuple<>
{
using data_type = TData;
using base = Tuple<>;
template <typename... Ys>
__host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
{
}
};
// Fixed-size array of 1 TData element with compile-time element access,
// implemented on top of Tuple.
template <typename TData>
struct StaticallyIndexedArray<TData, 1> : Tuple<TData>
{
    using data_type = TData;
    using base      = Tuple<TData>;

    // Forward constructor arguments to the underlying Tuple preserving value
    // category; plain `base(ys...)` would pass them all as lvalues and force
    // copies even when the caller supplied rvalues.
    template <typename... Ys>
    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys)
        : base(static_cast<Ys&&>(ys)...)
    {
    }
};
// Fixed-size array of 2 TData elements with compile-time element access,
// implemented on top of Tuple.
template <typename TData>
struct StaticallyIndexedArray<TData, 2> : Tuple<TData, TData>
{
    using data_type = TData;
    using base      = Tuple<TData, TData>;

    // Forward constructor arguments to the underlying Tuple preserving value
    // category; plain `base(ys...)` would pass them all as lvalues and force
    // copies even when the caller supplied rvalues.
    template <typename... Ys>
    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys)
        : base(static_cast<Ys&&>(ys)...)
    {
    }
};
// Fixed-size array of 3 TData elements with compile-time element access,
// implemented on top of Tuple.
template <typename TData>
struct StaticallyIndexedArray<TData, 3> : Tuple<TData, TData, TData>
{
    using data_type = TData;
    using base      = Tuple<TData, TData, TData>;

    // Forward constructor arguments to the underlying Tuple preserving value
    // category; plain `base(ys...)` would pass them all as lvalues and force
    // copies even when the caller supplied rvalues.
    template <typename... Ys>
    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys)
        : base(static_cast<Ys&&>(ys)...)
    {
    }
};
// Fixed-size array of 4 TData elements with compile-time element access,
// implemented on top of Tuple.
template <typename TData>
struct StaticallyIndexedArray<TData, 4> : Tuple<TData, TData, TData, TData>
{
    using data_type = TData;
    using base      = Tuple<TData, TData, TData, TData>;

    // Forward constructor arguments to the underlying Tuple preserving value
    // category; plain `base(ys...)` would pass them all as lvalues and force
    // copies even when the caller supplied rvalues.
    template <typename... Ys>
    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys)
        : base(static_cast<Ys&&>(ys)...)
    {
    }
};
// Fixed-size array of 5 TData elements with compile-time element access,
// implemented on top of Tuple.
template <typename TData>
struct StaticallyIndexedArray<TData, 5> : Tuple<TData, TData, TData, TData, TData>
{
    using data_type = TData;
    using base      = Tuple<TData, TData, TData, TData, TData>;

    // Forward constructor arguments to the underlying Tuple preserving value
    // category; plain `base(ys...)` would pass them all as lvalues and force
    // copies even when the caller supplied rvalues.
    template <typename... Ys>
    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys)
        : base(static_cast<Ys&&>(ys)...)
    {
    }
};
// Fixed-size array of 6 TData elements with compile-time element access,
// implemented on top of Tuple.
template <typename TData>
struct StaticallyIndexedArray<TData, 6> : Tuple<TData, TData, TData, TData, TData, TData>
{
    using data_type = TData;
    using base      = Tuple<TData, TData, TData, TData, TData, TData>;

    // Forward constructor arguments to the underlying Tuple preserving value
    // category; plain `base(ys...)` would pass them all as lvalues and force
    // copies even when the caller supplied rvalues.
    template <typename... Ys>
    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys)
        : base(static_cast<Ys&&>(ys)...)
    {
    }
};
// Fixed-size array of 7 TData elements with compile-time element access,
// implemented on top of Tuple.
template <typename TData>
struct StaticallyIndexedArray<TData, 7> : Tuple<TData, TData, TData, TData, TData, TData, TData>
{
    using data_type = TData;
    using base      = Tuple<TData, TData, TData, TData, TData, TData, TData>;

    // Forward constructor arguments to the underlying Tuple preserving value
    // category; plain `base(ys...)` would pass them all as lvalues and force
    // copies even when the caller supplied rvalues.
    template <typename... Ys>
    __host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys)
        : base(static_cast<Ys&&>(ys)...)
    {
    }
};
template <typename TData>
......@@ -65,6 +113,12 @@ struct StaticallyIndexedArray<TData, 8>
: Tuple<TData, TData, TData, TData, TData, TData, TData, TData>
{
using data_type = TData;
using base = Tuple<TData, TData, TData, TData, TData, TData, TData, TData>;
template <typename... Ys>
__host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
{
}
};
template <typename TData>
......@@ -72,6 +126,12 @@ struct StaticallyIndexedArray<TData, 9>
: Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData>
{
using data_type = TData;
using base = Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData>;
template <typename... Ys>
__host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
{
}
};
template <typename TData>
......@@ -79,6 +139,12 @@ struct StaticallyIndexedArray<TData, 10>
: Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData, TData>
{
using data_type = TData;
using base = Tuple<TData, TData, TData, TData, TData, TData, TData, TData, TData, TData>;
template <typename... Ys>
__host__ __device__ explicit constexpr StaticallyIndexedArray(Ys&&... ys) : base(ys...)
{
}
};
template <typename TData>
......
......@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 1
#elif 0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......
......@@ -10,15 +10,14 @@ cmake
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Debug \
-D DEVICE_BACKEND="AMD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
#-D CMAKE_CXX_FLAGS="-c -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-gline-tables-only -S -emit-llvm -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
rm *.ll *.s
/opt/rocm/llvm/bin/llvm-dis driver/conv_driver-hip-amdgcn-amd-amdhsa-gfx906-optimized.bc -o tmp.ll
/opt/rocm/llvm/bin/opt -S -inline -inline-threshold=104857 tmp.ll > inline.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa inline.ll > o3.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa o3.ll > o3_2.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_2.ll > o3_3.ll
/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_3.ll > o3_4.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_2.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_3.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_4.ll
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment