Commit 5514dc0b authored by Chao Liu's avatar Chao Liu
Browse files

prototype dynamic tensor descriptor

parent 0b7fcca6
...@@ -8,6 +8,76 @@ ...@@ -8,6 +8,76 @@
#include "dynamic_tensor_descriptor_helper.hpp" #include "dynamic_tensor_descriptor_helper.hpp"
namespace ck { namespace ck {
// Map a 2-D convolution (NCHW input, KCYX weight, NKHW output) onto an
// implicit-GEMM view by composing dynamic tensor-descriptor transforms on the
// input tensor:
//   1) pad:   in[N, C, Hi, Wi] -> in[N, C, Hip, Wip]   (left pads, then right pads)
//   2) embed: Hip -> (Y, Ho), Wip -> (X, Wo)           -> in[N, C, Y, Ho, X, Wo]
//   3) merge: (C, Y, X) -> GemmK, (N, Ho, Wo) -> GemmN -> in[GemmK, GemmN]
// conv_strides / conv_dilations / in_left_pads / in_right_pads are ordered [H, W].
// Returns a tuple currently holding only the input-tensor GEMM descriptor;
// weight/output GEMM descriptors are not built yet (prototype).
__host__ __device__ constexpr auto
map_convolution_into_gemm(const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads)
{
// Tensor lengths: weight is [K, C, Y, X], input is [N, C, Hi, Wi],
// output is [N, K, Ho, Wo].
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
// NOTE(review): K is computed but not used below (only the input-tensor
// descriptor is transformed so far).
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
// Per-spatial-dimension convolution parameters, ordered [H, W].
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
// input tensor
// Step 1: pad H and W. The inner transform applies the left pads
// (Hi -> Hi + InLeftPadH, Wi -> Wi + InLeftPadW); the outer transform then
// appends the right pads, so the lower lengths passed to DynamicRightPad
// already include the left padding.
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
// NOTE(review): Hip and Wip are computed but currently unused.
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
// Step 2: split each padded spatial dim into (filter, output) dims.
// DynamicEmbed<2> maps (y, ho) -> y*ConvDilationH + ho*ConvStrideH + 0; the
// trailing 0 is the constant coefficient (coefficients[NDimUp]).
// Upper dims after this transform: [N(0), C(1), Y(2), Ho(3), X(4), Wo(5)].
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
// Step 3: fold into the 2-D GEMM view — GemmK = C*Y*X (dims 1,2,4),
// GemmN = N*Ho*Wo (dims 0,3,5).
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// Only the input-tensor GEMM descriptor is returned for now.
return make_tuple(in_gemmk_gemmn_global_desc);
}
template <index_t BlockSize> template <index_t BlockSize>
struct DummyDynamicTransform struct DummyDynamicTransform
...@@ -462,64 +532,15 @@ struct DummyDynamicTransform ...@@ -462,64 +532,15 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads, const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const const Array<index_t, 2> in_right_pads) const
{ {
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0); const auto transformed_tensor_descs = map_convolution_into_gemm(wei_k_c_y_x_global_desc,
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1); in_n_c_hi_wi_global_desc,
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1); out_n_k_ho_wo_global_desc,
conv_strides,
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2); conv_dilations,
const index_t X = wei_k_c_y_x_global_desc.GetLength(3); in_left_pads,
in_right_pads);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3); const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#pragma unroll 1 #pragma unroll 1
for(index_t iter = 0; iter < 100; ++iter) for(index_t iter = 0; iter < 100; ++iter)
......
...@@ -186,7 +186,8 @@ struct DynamicRightPad ...@@ -186,7 +186,8 @@ struct DynamicRightPad
} }
}; };
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp] // idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +
// coefficients[nDimUp]
template <index_t NDimUp> template <index_t NDimUp>
struct DynamicEmbed struct DynamicEmbed
{ {
...@@ -277,7 +278,7 @@ struct DynamicMerge ...@@ -277,7 +278,7 @@ struct DynamicMerge
__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths) __host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths}, : low_lengths_{low_lengths},
low_lengths_scan_{reverse_inclusive_scan_on_array( low_lengths_scan_{reverse_exclusive_scan_on_array(
low_lengths, math::multiplies<index_t>{}, index_t{1})}, low_lengths, math::multiplies<index_t>{}, index_t{1})},
up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)} up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)}
{ {
...@@ -303,16 +304,16 @@ struct DynamicMerge ...@@ -303,16 +304,16 @@ struct DynamicMerge
static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension"); "wrong! inconsistent # of dimension");
index_t itmp = idx_up[0]; index_t tmp = idx_up[0];
#pragma unroll #pragma unroll
for(index_t i; i < NDimLow - 1; ++i) for(index_t i = 0; i < NDimLow - 1; ++i)
{ {
idx_low(i) = itmp / low_lengths_scan_[i]; idx_low(i) = tmp / low_lengths_scan_[i];
itmp -= idx_low[i] * low_lengths_scan_[i]; tmp -= idx_low[i] * low_lengths_scan_[i];
} }
idx_low(NDimLow - 1) = itmp; idx_low(NDimLow - 1) = tmp;
} }
// idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
...@@ -334,10 +335,14 @@ struct DynamicMerge ...@@ -334,10 +335,14 @@ struct DynamicMerge
// 1) If idx_up_diff is known at compile-time, then idx_low_diff_const // 1) If idx_up_diff is known at compile-time, then idx_low_diff_const
// can be calculated at compile-time. // can be calculated at compile-time.
// 2) If idx_up_diff is not known at compile-time, but its value // 2) If idx_up_diff is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also // doesn't change during the whole kernel execution, then
// doesn't change during the whole kernel execution. Compiler generated ISA should // idx_low_diff_const also
// only caclculate idx_low_diff_const once and save it durinng the whole kernel execution // doesn't change during the whole kernel execution. Compiler generated
// If neither 1) nor 2) is satisfied, then the calculation will also be computed at // ISA should
// only caclculate idx_low_diff_const once and save it durinng the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive. // run-time each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff); LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
...@@ -346,7 +351,7 @@ struct DynamicMerge ...@@ -346,7 +351,7 @@ struct DynamicMerge
index_t carry = 0; index_t carry = 0;
#pragma unroll #pragma unroll
for(index_t i = NDimLow - 1; i > 1; --i) for(index_t i = NDimLow - 1; i > 0; --i)
{ {
// this should be saved as well // this should be saved as well
index_t idx_low_length_minus_idx_low_diff_const = index_t idx_low_length_minus_idx_low_diff_const =
......
...@@ -168,10 +168,13 @@ struct DynamicTransformedTensorDescriptor ...@@ -168,10 +168,13 @@ struct DynamicTransformedTensorDescriptor
is_valid_sequence_map<sorted_low_dimension_ids>{}, is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly"); "wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while a up-dimension could be associated with multille // TODO: sanity check: while a up-dimension could be associated with
// transformation, a low-dimension should be associated with only one transformation // multille
// transformation, a low-dimension should be associated with only one
// transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths // TODO: sanity-check: GetLowerLengths of each transform should be
// consistent with lengths
// of lower-tensor-descriptor // of lower-tensor-descriptor
} }
...@@ -222,14 +225,16 @@ struct DynamicTransformedTensorDescriptor ...@@ -222,14 +225,16 @@ struct DynamicTransformedTensorDescriptor
return low_tensor_desc_.GetElementSpace(); return low_tensor_desc_.GetElementSpace();
} }
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const { return low_tensor_desc_; }
template <typename LowIdx, typename UpIdx> template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{ {
static_for<0, NTransform, 1>{}([&](auto itran) constexpr { static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
auto tran = transforms_.At(itran); const auto tran = transforms_.At(itran);
auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran)); const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran)); auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndex(idx_low_part, idx_up_part); tran.CalculateLowerIndex(idx_low_part, idx_up_part);
}); });
......
#ifndef CK_ARRAY_HPP #ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP #define CK_ARRAY_HPP
#include "sequence.hpp"
#include "functional2.hpp" #include "functional2.hpp"
#include "sequence.hpp"
namespace ck { namespace ck {
......
...@@ -288,7 +288,7 @@ reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in ...@@ -288,7 +288,7 @@ reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[i]); r = f(r, x[i]);
} }
y(NSize - 1) = r; y(0) = r;
return y; return y;
} }
......
#ifndef CK_COMMON_HEADER_HPP #ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP #define CK_COMMON_HEADER_HPP
#include "config.hpp"
#include "utility.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "math.hpp"
#include "sequence.hpp"
#include "array.hpp" #include "array.hpp"
#include "array_helper.hpp" #include "array_helper.hpp"
#include "config.hpp"
#include "float_type.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "functional2.hpp" #include "functional2.hpp"
#include "functional3.hpp" #include "functional3.hpp"
#include "functional4.hpp" #include "functional4.hpp"
#include "in_memory_operation.hpp" #include "in_memory_operation.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "synchronization.hpp" #include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "utility.hpp"
#if CK_USE_AMD_INLINE_ASM #if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp" #include "amd_inline_asm.hpp"
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
#define CK_FUNCTIONAL_HPP #define CK_FUNCTIONAL_HPP
#include "integral_constant.hpp" #include "integral_constant.hpp"
#include "sequence.hpp"
#include "type.hpp" #include "type.hpp"
namespace ck { namespace ck {
...@@ -56,8 +55,10 @@ struct static_if<true> ...@@ -56,8 +55,10 @@ struct static_if<true>
__host__ __device__ constexpr auto operator()(F f) const __host__ __device__ constexpr auto operator()(F f) const
{ {
// This is a trick for compiler: // This is a trick for compiler:
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it, // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will
// this will make "f" a generic lambda, so that "f" won't be compiled until being // use it,
// this will make "f" a generic lambda, so that "f" won't be compiled
// until being
// instantiated here // instantiated here
f(forwarder{}); f(forwarder{});
return Type{}; return Type{};
...@@ -84,8 +85,10 @@ struct static_if<false> ...@@ -84,8 +85,10 @@ struct static_if<false>
__host__ __device__ static void Else(F f) __host__ __device__ static void Else(F f)
{ {
// This is a trick for compiler: // This is a trick for compiler:
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it, // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will
// this will make "f" a generic lambda, so that "f" won't be compiled until being // use it,
// this will make "f" a generic lambda, so that "f" won't be compiled
// until being
// instantiated here // instantiated here
f(forwarder{}); f(forwarder{});
} }
......
#ifndef CK_FUNCTIONAL3_HPP #ifndef CK_FUNCTIONAL3_HPP
#define CK_FUNCTIONAL3_HPP #define CK_FUNCTIONAL3_HPP
#include "array.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "functional2.hpp" #include "functional2.hpp"
#include "sequence.hpp" #include "sequence.hpp"
#include "array.hpp"
namespace ck { namespace ck {
...@@ -83,8 +83,10 @@ struct ford_impl<Sequence<>, Orders> ...@@ -83,8 +83,10 @@ struct ford_impl<Sequence<>, Orders>
} // namespace detail } // namespace detail
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop // Lengths is Sequence<...>, it is the length of each dimension for
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each // N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford
// will loop over each
// dimension // dimension
template <class Lengths, template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type> class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
...@@ -106,8 +108,10 @@ struct static_ford ...@@ -106,8 +108,10 @@ struct static_ford
} }
}; };
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop // Lengths is Sequence<...>, it is the length of each dimension for
// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each // N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop
// over each
// dimension // dimension
template <class Lengths, template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type> class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
......
...@@ -165,6 +165,6 @@ struct less ...@@ -165,6 +165,6 @@ struct less
}; };
} // namespace math } // namespace math
} // namspace ck } // namespace ck
#endif #endif
#ifndef CK_PRINT_ARRAY_HPP
#define CK_PRINT_ARRAY_HPP
#include "array.hpp"
namespace ck {
// Debug helper (host or device): print label "s", the array size, and the
// elements of an Array<uint32_t, NSize>, e.g. `name size 3, {1 2 3}`.
// Supports 1 <= NSize <= 10 only. static_if is used so that only the printf
// branch matching nsize is instantiated; the lambdas for other sizes (which
// would index a[] out of range for smaller arrays) are never compiled.
// NOTE(review): nsize has type index_t but is printed with "%u" — assumes
// index_t formats as a 32-bit unsigned; verify against config.hpp.
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<uint32_t, NSize> a)
{
constexpr index_t nsize = a.GetSize();
// Only sizes 1..10 have a matching printf branch below.
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
// Debug helper (host or device): print label "s", the array size, and the
// elements of an Array<int32_t, NSize>, e.g. `name size 3, {1 2 3}`.
// Signed counterpart of the uint32_t overload; supports 1 <= NSize <= 10.
// static_if ensures only the printf branch matching nsize is instantiated, so
// the out-of-range a[] indexing in the other lambdas is never compiled.
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<int32_t, NSize> a)
{
constexpr index_t nsize = a.GetSize();
// Only sizes 1..10 have a matching printf branch below.
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
} // namespace ck
#endif
#ifndef CK_PRINT_SEQUENCE_HPP
#define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp"
namespace ck {
// Debug helper (host or device): print label "s", the size, and the elements
// of a compile-time Sequence<Xs...>, e.g. `name size 2, {3 4}`. Supports
// 0 <= Size <= 10; each branch expands the Xs pack directly into printf's
// argument list, and static_if ensures only the branch whose format string
// matches the pack length is instantiated.
// NOTE(review): elements are printed with "%u" — assumes index_t values
// format as 32-bit unsigned; verify against config.hpp.
template <index_t... Xs>
__host__ __device__ void print_sequence(const char* s, Sequence<Xs...>)
{
constexpr index_t nsize = Sequence<Xs...>::Size();
// Only sizes 0..10 have a matching printf branch below.
static_assert(nsize <= 10, "wrong!");
static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 5>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 6>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 7>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 8>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 9>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
} // namespace ck
#endif
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
#define CK_TUPLE_HPP #define CK_TUPLE_HPP
#include "integral_constant.hpp" #include "integral_constant.hpp"
#include "type.hpp"
#include "sequence.hpp" #include "sequence.hpp"
#include "type.hpp"
namespace ck { namespace ck {
......
...@@ -9,6 +9,6 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; } ...@@ -9,6 +9,6 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; } __device__ index_t get_block_1d_id() { return blockIdx.x; }
} // namspace ck } // namespace ck
#endif #endif
...@@ -40,6 +40,31 @@ void device_dummy_dynamic_transform(InDesc, ...@@ -40,6 +40,31 @@ void device_dummy_dynamic_transform(InDesc,
const auto in_left_pads = to_array(InLeftPads{}); const auto in_left_pads = to_array(InLeftPads{});
const auto in_right_pads = to_array(InRightPads{}); const auto in_right_pads = to_array(InRightPads{});
{
const auto tensor_descs = map_convolution_into_gemm(wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
print_array("cpu: in_gemmk_gemmn_global_desc:", in_gemmk_gemmn_global_desc.GetLengths());
const auto idx0 = MultiIndex<2>({2591, 36991});
const auto idx1 = in_gemmk_gemmn_global_desc.CalculateLowerIndex(idx0);
const auto idx2 =
in_gemmk_gemmn_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
const index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx0);
print_array("idx0:", idx0);
print_array("idx1:", idx1);
print_array("idx2:", idx2);
printf("offset %d\n", offset);
}
std::size_t data_sz = sizeof(T); std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace()); DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
...@@ -54,6 +79,7 @@ void device_dummy_dynamic_transform(InDesc, ...@@ -54,6 +79,7 @@ void device_dummy_dynamic_transform(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
#if 0
using dummy_transform = DummyDynamicTransform<BlockSize>; using dummy_transform = DummyDynamicTransform<BlockSize>;
for(index_t i = 0; i < 5; ++i) for(index_t i = 0; i < 5; ++i)
...@@ -92,6 +118,7 @@ void device_dummy_dynamic_transform(InDesc, ...@@ -92,6 +118,7 @@ void device_dummy_dynamic_transform(InDesc,
in_right_pads); in_right_pads);
} }
} }
#endif
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
} }
...@@ -5,8 +5,7 @@ ...@@ -5,8 +5,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "config.hpp" #include "config.hpp"
#include "print_array.hpp" #include "print.hpp"
#include "print_sequence.hpp"
#include "device.hpp" #include "device.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
...@@ -496,10 +495,10 @@ int main(int argc, char* argv[]) ...@@ -496,10 +495,10 @@ int main(int argc, char* argv[])
ostream_tensor_descriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); ostream_tensor_descriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_tensor_descriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_tensor_descriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_tensor_descriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); ostream_tensor_descriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
print_sequence("LeftPads", LeftPads{}); print_array("LeftPads", LeftPads{});
print_sequence("RightPads", RightPads{}); print_array("RightPads", RightPads{});
print_sequence("ConvStrides", ConvStrides{}); print_array("ConvStrides", ConvStrides{});
print_sequence("ConvDilations", ConvDilations{}); print_array("ConvDilations", ConvDilations{});
#if 1 #if 1
using in_data_t = float; using in_data_t = float;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment