Commit e273d4d3 authored by Jing Zhang

fixed

parent 332f9039
@@ -151,8 +151,8 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
         // add tensor
         const auto add_k_n_hopx2_wopx2_global_desc = transform_dynamic_tensor_descriptor(
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2, K1)),
-            make_tuple(make_merge_transform(make_tuple(K0, K1)),
+            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Hox2, Wox2, 1)),
+            make_tuple(make_merge_transform(make_tuple(K0, 1)),
                        make_pass_through_transform(N),
                        make_pad_transform(Hox2, 0, AddRightPadH),
                        make_pad_transform(Wox2, 0, AddRightPadW)),
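For readers of this hunk: a minimal standalone sketch of the merge-transform index arithmetic the add-tensor descriptor relies on, assuming a merge of (K0, K1) linearizes as k0 * K1 + k1 (a simplification for illustration, not the library's actual transform code). With the last dimension shrunk from K1 to 1, the merged coordinate collapses to k0 alone, which is what the edited descriptor describes.

```cpp
#include <cassert>

// Hypothetical flattening of a merged (K0, K1) dimension: offset = k0 * K1 + k1.
constexpr int merged_index(int k0, int k1, int K1) { return k0 * K1 + k1; }

int main()
{
    // With K1 = 1, as in the edited descriptor, the merged index is just k0.
    constexpr int K1 = 1;
    static_assert(merged_index(5, 0, K1) == 5, "merge of (K0, 1) degenerates to K0");
    assert(merged_index(7, 0, K1) == 7);
    return 0;
}
```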
@@ -366,10 +366,11 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
         const index_t wox2_thread_data_on_global =
             wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
-        static_assert(KPerThread % 16 == 0, "");
-        constexpr auto KPerThreadAdd = KPerThread / 16;
+        static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
+        constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
-        const index_t k_block_data_on_global_add = k_block_work_id * KPerBlock / 16;
+        const index_t k_block_data_on_global_add =
+            k_block_work_id * KPerBlock / CThreadTransferDstScalarPerVector;
         const index_t k_thread_data_on_global_add =
             k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
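A minimal sketch of the index arithmetic this hunk generalizes, with the hard-coded 16 replaced by a template parameter standing in for CThreadTransferDstScalarPerVector; the free function below is illustrative, not the kernel's own code.

```cpp
#include <cstdio>

// Per-thread K partitioning when the C-transfer writes ScalarPerVector scalars
// at a time; mirrors the static_assert and the divisions in the hunk above.
template <int KPerThread, int KPerBlock, int ScalarPerVector>
int k_thread_data_on_global_add(int k_block_work_id, int k_thread_id)
{
    static_assert(KPerThread % ScalarPerVector == 0,
                  "per-thread K must split evenly into destination vectors");
    constexpr int KPerThreadAdd = KPerThread / ScalarPerVector;
    const int k_block_data_on_global_add =
        k_block_work_id * KPerBlock / ScalarPerVector;
    return k_block_data_on_global_add + k_thread_id * KPerThreadAdd;
}

int main()
{
    // Example: KPerThread = 16, KPerBlock = 128, 16 scalars per vector.
    std::printf("%d\n", k_thread_data_on_global_add<16, 128, 16>(2, 3));
    return 0;
}
```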
@@ -382,11 +383,11 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
         FloatAB p_d_thread[d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize()];
         constexpr auto vector_len = sizeof(FloatAB) / sizeof(FloatC);
-        static_assert(vector_len == 16);
+        static_assert(vector_len == CThreadTransferDstScalarPerVector);
         constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
-#if 1
+#if 0
         ThreadwiseDynamicTensorSliceTransfer_v2<
             FloatAB,
             FloatAB,
@@ -415,17 +416,26 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
 #endif
-#if 0
+#if 1
         for(index_t k_i = 0; k_i < KPerThreadAdd; ++k_i)
         {
             for(index_t h_i = 0; h_i < HoPerThreadx2; ++h_i)
             {
                 for(index_t w_i = 0; w_i < WoPerThreadx2; ++w_i)
                 {
+                    vector_type<FloatC, vector_len> d_vec;
+                    d_vec.Vector() = p_d_thread[d_k_n_hox2_wox2_thread_desc.CalculateOffset(
+                        make_tuple(k_i, 0, h_i, w_i))];
+                    static_for<0, vector_len, 1>{}([&](auto i) {
+                        d_vec.Scalars()(i) = 0;
+                            //p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
+                                //make_tuple(k_i * vector_len + i, 0, h_i / 2, w_i / 2))];
+                    });
                     p_d_thread[d_k_n_hox2_wox2_thread_desc.CalculateOffset(
-                        make_tuple(k_i, 0, h_i, w_i))] += 1;
-                        //p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
-                            //make_tuple(k_i, 0, h_i / 2, w_i / 2))];
+                        make_tuple(k_i, 0, h_i, w_i))] = d_vec.Vector();
                 }
             }
         }
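The two hunks above tie vector_len to sizeof(FloatAB) / sizeof(FloatC) and rewrite p_d_thread one packed vector at a time. Below is a self-contained sketch of that read-modify-write pattern, using a hand-rolled 16-byte struct in place of the library's vector_type (an assumption made purely for illustration).

```cpp
#include <cstddef>
#include <cstdint>

// Stand-in for one wide storage element holding 16 narrow scalars.
struct Int8x16
{
    std::int8_t s[16];
};

int main()
{
    using FloatAB = Int8x16;     // wide element type (assumed for the sketch)
    using FloatC  = std::int8_t; // narrow scalar type (assumed for the sketch)

    constexpr std::size_t vector_len = sizeof(FloatAB) / sizeof(FloatC);
    static_assert(vector_len == 16, "one wide element packs 16 narrow scalars");

    FloatAB p_d_thread[4] = {};

    // Same shape as the loop in the hunk: load the packed element, set its
    // scalars, then store the whole vector back with a single assignment.
    for(std::size_t idx = 0; idx < 4; ++idx)
    {
        FloatAB d_vec = p_d_thread[idx];
        for(std::size_t i = 0; i < vector_len; ++i)
            d_vec.s[i] = 0;
        p_d_thread[idx] = d_vec;
    }

    return 0;
}
```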
@@ -24,7 +24,8 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
     return os;
 }
-typedef enum {
+typedef enum
+{
     Half = 0,
     Float = 1,
 } DataType_t;
@@ -78,7 +78,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<1, 1>;
     using RightPads = Sequence<1, 1>;
-#elif 0
+#elif 1
     constexpr index_t N = 1;
     constexpr index_t C = 4;
     constexpr index_t HI = 64;
@@ -637,7 +637,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", to_multi_index(ConvStrides{}));
     print_array("ConvDilations", to_multi_index(ConvDilations{}));
-#if 0
+#if 1
     using in_data_t = float;
     constexpr index_t in_vector_size = 1;
     using acc_data_t = float;
@@ -654,7 +654,7 @@ int main(int argc, char* argv[])
     using out_data_t = int8_t;
 #elif 1
     using in_data_t = int8_t;
-    constexpr index_t in_vector_size = 16;
+    constexpr index_t in_vector_size = 4;
     using acc_data_t = int32_t;
     using out_data_t = int8_t;
 #endif
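The int8 path now uses in_vector_size = 4. As a rough illustration of what a 4-wide int8 vector amounts to, here is a tiny sketch that packs four int8 values into one 32-bit word with a single 4-byte copy (the kernel itself uses the library's vector types, not this helper).

```cpp
#include <cstdint>
#include <cstring>

// Pack four int8 inputs into one 32-bit word - the granularity implied by
// in_vector_size = 4 on the int8 path. Purely illustrative.
std::uint32_t pack_int8x4(const std::int8_t* p)
{
    std::uint32_t w;
    std::memcpy(&w, p, sizeof(w)); // one 4-byte load instead of four 1-byte loads
    return w;
}

int main()
{
    const std::int8_t in[4] = {1, -2, 3, -4};
    return pack_int8x4(in) != 0 ? 0 : 1;
}
```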