Commit 5514dc0b authored by Chao Liu's avatar Chao Liu
Browse files

prototype dynamic tensor descriptor

parent 0b7fcca6
......@@ -8,6 +8,76 @@
#include "dynamic_tensor_descriptor_helper.hpp"
namespace ck {
// Map an implicit-GEMM convolution onto a GEMM problem by rewriting the
// input tensor descriptor [N, C, Hi, Wi] as a 2D [GemmK, GemmN] matrix:
//   1) pad H and W (left pad first, then right pad)    -> [N, C, Hip, Wip]
//   2) embed padded H/W into (Y, Ho) and (X, Wo), where the filter-tap
//      index advances by ConvDilation and the output-pixel index advances
//      by ConvStride                                   -> [N, C, Y, Ho, X, Wo]
//   3) merge (C, Y, X) -> GemmK and (N, Ho, Wo) -> GemmN -> [GemmK, GemmN]
//
// wei_k_c_y_x_global_desc  : weight descriptor; only the Y/X lengths are read
// in_n_c_hi_wi_global_desc : input descriptor; the tensor being transformed
// out_n_k_ho_wo_global_desc: output descriptor; only the Ho/Wo lengths are read
// conv_strides, conv_dilations, in_left_pads, in_right_pads: [H, W] pairs
//
// Returns a tuple holding only the transformed input (GemmK x GemmN)
// descriptor; weight/output GEMM descriptors are presumably added in a
// later revision of this prototype -- TODO confirm.
__host__ __device__ constexpr auto
map_convolution_into_gemm(const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
                          const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
                          const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
                          const Array<index_t, 2> conv_strides,
                          const Array<index_t, 2> conv_dilations,
                          const Array<index_t, 2> in_left_pads,
                          const Array<index_t, 2> in_right_pads)
{
    const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
    const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);

    const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
    const index_t X = wei_k_c_y_x_global_desc.GetLength(3);

    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);

    const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
    const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);

    const index_t ConvStrideH = conv_strides[0];
    const index_t ConvStrideW = conv_strides[1];

    const index_t ConvDilationH = conv_dilations[0];
    const index_t ConvDilationW = conv_dilations[1];

    const index_t InLeftPadH = in_left_pads[0];
    const index_t InLeftPadW = in_left_pads[1];

    const index_t InRightPadH = in_right_pads[0];
    const index_t InRightPadW = in_right_pads[1];

    // input tensor: apply left pad, then right pad. The right pad acts on
    // the already-left-padded lengths, hence Hi + InLeftPadH / Wi + InLeftPadW.
    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
        transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(DynamicPassThrough{N},
                       DynamicPassThrough{C},
                       DynamicLeftPad{Hi, InLeftPadH},
                       DynamicLeftPad{Wi, InLeftPadW}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    // split padded H into (Y, Ho) and padded W into (X, Wo):
    // low_idx = ConvDilation * filter_idx + ConvStride * output_idx + 0
    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_hip_wip_global_desc,
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
                   DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

    // collapse to the GEMM view: GemmK = C*Y*X, GemmN = N*Ho*Wo
    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_y_ho_x_wo_global_desc,
        make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    return make_tuple(in_gemmk_gemmn_global_desc);
}
template <index_t BlockSize>
struct DummyDynamicTransform
......@@ -462,64 +532,15 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
const auto transformed_tensor_descs = map_convolution_into_gemm(wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
#pragma unroll 1
for(index_t iter = 0; iter < 100; ++iter)
......
......@@ -186,7 +186,8 @@ struct DynamicRightPad
}
};
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp]
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +
// coefficients[nDimUp]
template <index_t NDimUp>
struct DynamicEmbed
{
......@@ -277,7 +278,7 @@ struct DynamicMerge
__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
low_lengths_scan_{reverse_inclusive_scan_on_array(
low_lengths_scan_{reverse_exclusive_scan_on_array(
low_lengths, math::multiplies<index_t>{}, index_t{1})},
up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)}
{
......@@ -303,16 +304,16 @@ struct DynamicMerge
static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
index_t itmp = idx_up[0];
index_t tmp = idx_up[0];
#pragma unroll
for(index_t i; i < NDimLow - 1; ++i)
for(index_t i = 0; i < NDimLow - 1; ++i)
{
idx_low(i) = itmp / low_lengths_scan_[i];
itmp -= idx_low[i] * low_lengths_scan_[i];
idx_low(i) = tmp / low_lengths_scan_[i];
tmp -= idx_low[i] * low_lengths_scan_[i];
}
idx_low(NDimLow - 1) = itmp;
idx_low(NDimLow - 1) = tmp;
}
// idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
......@@ -334,10 +335,14 @@ struct DynamicMerge
// 1) If idx_up_diff is known at compile-time, then idx_low_diff_const
// can be calculated at compile-time.
// 2) If idx_up_diff is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also
// doesn't change during the whole kernel execution. Compiler generated ISA should
only calculate idx_low_diff_const once and save it during the whole kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be computed at
// doesn't change during the whole kernel execution, then
// idx_low_diff_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
only calculate idx_low_diff_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
......@@ -346,7 +351,7 @@ struct DynamicMerge
index_t carry = 0;
#pragma unroll
for(index_t i = NDimLow - 1; i > 1; --i)
for(index_t i = NDimLow - 1; i > 0; --i)
{
// this should be saved as well
index_t idx_low_length_minus_idx_low_diff_const =
......
......@@ -168,10 +168,13 @@ struct DynamicTransformedTensorDescriptor
is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while an up-dimension could be associated with multiple
// transformation, a low-dimension should be associated with only one transformation
// TODO: sanity check: while an up-dimension could be associated with
// multiple
// transformation, a low-dimension should be associated with only one
// transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths
// TODO: sanity-check: GetLowerLengths of each transform should be
// consistent with lengths
// of lower-tensor-descriptor
}
......@@ -222,13 +225,15 @@ struct DynamicTransformedTensorDescriptor
return low_tensor_desc_.GetElementSpace();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const { return low_tensor_desc_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
auto tran = transforms_.At(itran);
const auto tran = transforms_.At(itran);
auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndex(idx_low_part, idx_up_part);
......
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP
#include "sequence.hpp"
#include "functional2.hpp"
#include "sequence.hpp"
namespace ck {
......
......@@ -288,7 +288,7 @@ reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[i]);
}
y(NSize - 1) = r;
y(0) = r;
return y;
}
......
#ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#include "config.hpp"
#include "utility.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "math.hpp"
#include "sequence.hpp"
#include "array.hpp"
#include "array_helper.hpp"
#include "config.hpp"
#include "float_type.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
#include "functional4.hpp"
#include "in_memory_operation.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "utility.hpp"
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
......
......@@ -2,7 +2,6 @@
#define CK_FUNCTIONAL_HPP
#include "integral_constant.hpp"
#include "sequence.hpp"
#include "type.hpp"
namespace ck {
......@@ -56,8 +55,10 @@ struct static_if<true>
__host__ __device__ constexpr auto operator()(F f) const
{
// This is a trick for compiler:
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
// this will make "f" a generic lambda, so that "f" won't be compiled until being
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will
// use it,
// this will make "f" a generic lambda, so that "f" won't be compiled
// until being
// instantiated here
f(forwarder{});
return Type{};
......@@ -84,8 +85,10 @@ struct static_if<false>
__host__ __device__ static void Else(F f)
{
// This is a trick for compiler:
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
// this will make "f" a generic lambda, so that "f" won't be compiled until being
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will
// use it,
// this will make "f" a generic lambda, so that "f" won't be compiled
// until being
// instantiated here
f(forwarder{});
}
......
#ifndef CK_FUNCTIONAL3_HPP
#define CK_FUNCTIONAL3_HPP
#include "array.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "sequence.hpp"
#include "array.hpp"
namespace ck {
......@@ -83,8 +83,10 @@ struct ford_impl<Sequence<>, Orders>
} // namespace detail
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each
// Lengths is Sequence<...>, it is the length of each dimension for
// N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford
// will loop over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
......@@ -106,8 +108,10 @@ struct static_ford
}
};
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
// Lengths is Sequence<...>, it is the length of each dimension for
// N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop
// over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
......
......@@ -165,6 +165,6 @@ struct less
};
} // namespace math
} // namspace ck
} // namespace ck
#endif
#ifndef CK_PRINT_ARRAY_HPP
#define CK_PRINT_ARRAY_HPP
#include "array.hpp"
namespace ck {
// Debug helper: prints label `s`, the element count, and every element of a
// uint32_t Array via printf ("%u" per element). Usable from host and device
// code; device-side printf is serialized and for debugging only.
// Each size needs its own fixed format string, and an out-of-range a[k]
// would not compile, so static_if instantiates only the branch matching
// the compile-time size.
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<uint32_t, NSize> a)
{
// compile-time number of elements
constexpr index_t nsize = a.GetSize();
// only sizes 1..10 have a printf branch below
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
// Debug helper: int32_t overload of print_array. Prints label `s`, the
// element count, and every element via printf ("%d" per element). Usable
// from host and device code; device-side printf is for debugging only.
// static_if instantiates only the branch matching the compile-time size,
// since an out-of-range a[k] would not compile.
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<int32_t, NSize> a)
{
// compile-time number of elements
constexpr index_t nsize = a.GetSize();
// only sizes 1..10 have a printf branch below
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
} // namespace ck
#endif
#ifndef CK_PRINT_SEQUENCE_HPP
#define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp"
namespace ck {
// Debug helper: prints label `s`, the size, and the compile-time integer
// pack Xs... of a Sequence via printf. Usable from host and device code.
// Each size needs its own fixed format string, so static_if instantiates
// only the branch whose format matches the pack length; sizes 0..10 are
// supported (enforced by the static_assert).
template <index_t... Xs>
__host__ __device__ void print_sequence(const char* s, Sequence<Xs...>)
{
// compile-time number of elements in the pack
constexpr index_t nsize = Sequence<Xs...>::Size();
static_assert(nsize <= 10, "wrong!");
static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 5>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 6>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 7>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 8>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 9>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
} // namespace ck
#endif
......@@ -2,8 +2,8 @@
#define CK_TUPLE_HPP
#include "integral_constant.hpp"
#include "type.hpp"
#include "sequence.hpp"
#include "type.hpp"
namespace ck {
......
......@@ -9,6 +9,6 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
} // namspace ck
} // namespace ck
#endif
......@@ -40,6 +40,31 @@ void device_dummy_dynamic_transform(InDesc,
const auto in_left_pads = to_array(InLeftPads{});
const auto in_right_pads = to_array(InRightPads{});
{
const auto tensor_descs = map_convolution_into_gemm(wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
print_array("cpu: in_gemmk_gemmn_global_desc:", in_gemmk_gemmn_global_desc.GetLengths());
const auto idx0 = MultiIndex<2>({2591, 36991});
const auto idx1 = in_gemmk_gemmn_global_desc.CalculateLowerIndex(idx0);
const auto idx2 =
in_gemmk_gemmn_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
const index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx0);
print_array("idx0:", idx0);
print_array("idx1:", idx1);
print_array("idx2:", idx2);
printf("offset %d\n", offset);
}
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
......@@ -54,6 +79,7 @@ void device_dummy_dynamic_transform(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
#if 0
using dummy_transform = DummyDynamicTransform<BlockSize>;
for(index_t i = 0; i < 5; ++i)
......@@ -92,6 +118,7 @@ void device_dummy_dynamic_transform(InDesc,
in_right_pads);
}
}
#endif
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
......@@ -5,8 +5,7 @@
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
......@@ -496,10 +495,10 @@ int main(int argc, char* argv[])
ostream_tensor_descriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_tensor_descriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_tensor_descriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
print_sequence("LeftPads", LeftPads{});
print_sequence("RightPads", RightPads{});
print_sequence("ConvStrides", ConvStrides{});
print_sequence("ConvDilations", ConvDilations{});
print_array("LeftPads", LeftPads{});
print_array("RightPads", RightPads{});
print_array("ConvStrides", ConvStrides{});
print_array("ConvDilations", ConvDilations{});
#if 1
using in_data_t = float;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment