Commit 5e627be5 authored by Jing Zhang

debug

parent 78740e43
......@@ -269,7 +269,6 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
index_t nrepeat = 100;
#if 1
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
......@@ -376,7 +375,6 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#endif
}
};
} // namespace ck
......
......@@ -51,7 +51,11 @@ struct GridwiseStaticGemm_km_kn_mn_v3
{
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
constexpr auto E = EPerBlock * 3 * 3;
constexpr auto I0 = Number<0>{};
constexpr auto a_e_k_global_desc = AGlobalDesc{};
constexpr auto E = a_e_k_global_desc.GetLength(I0);
constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
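
The change above replaces the hard-coded E = EPerBlock * 3 * 3 with the E length read from the A descriptor at compile time. A minimal standalone analogue of the idea (StaticDesc, GetLength and the 144/16 lengths are illustrative stand-ins, not the real ck:: descriptor API):

#include <cstdio>

// Hypothetical stand-in for a compile-time tensor descriptor: it only
// carries its lengths as non-type template parameters, so a length can be
// read as a constexpr value, similar to a_e_k_global_desc.GetLength(I0).
template <int... Lengths>
struct StaticDesc
{
    template <int I>
    static constexpr int GetLength()
    {
        constexpr int lens[] = {Lengths...};
        return lens[I];
    }
};

int main()
{
    // An "E x K" descriptor with E = C * Y * X = 16 * 3 * 3 = 144 and K = 16
    // (example values matching the 3x3 convolution in this commit).
    using AEKDesc = StaticDesc<144, 16>;

    // E is read from the descriptor instead of being hard-coded as
    // EPerBlock * 3 * 3, so the LDS size stays consistent with the actual
    // A-matrix layout.
    constexpr int E = AEKDesc::GetLength<0>();
    std::printf("E = %d\n", E);
    return 0;
}
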
......@@ -393,60 +397,6 @@ struct GridwiseStaticGemm_km_kn_mn_v3
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#if 0
// pass tensor descriptors by their pointers
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *p_a_e_k_global_desc;
const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by void*
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const void* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const void* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const void* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
const auto b_e_n_ho_wo_global_desc =
*reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
const auto c_k_n_ho_wo_global_desc =
*reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#endif
};
} // namespace ck
......
......@@ -567,6 +567,7 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
{
vector_type<half_t, 8> tmp{src_thread_data};
#if 0
__llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
......@@ -578,6 +579,11 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type<T, N>::type
dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(half_t),
0);
#else
auto tmp_ = as_type<float4_t>(tmp);
__llvm_amdgcn_raw_buffer_store_fp32x4(
tmp_, dst_wave_buffer_resource, dst_thread_addr_offset, dst_wave_addr_offset, 0);
#endif
}
}
}
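
The #else branch above writes all eight fp16 values with a single 128-bit fp32x4 buffer store by bit-reinterpreting the half8 vector as a float4, instead of issuing two 64-bit fp16x4 stores. A small host-side sketch of just the reinterpretation step, under the assumption that only the bit pattern matters (clang vector extensions; half_bits and the printed value are illustrative, not part of the kernel):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Clang vector-extension types standing in for ck's half8_t / float4_t.
typedef uint16_t half_bits; // assumption: only the 16-bit payload matters here
typedef half_bits half8_t __attribute__((ext_vector_type(8)));
typedef float float4_t __attribute__((ext_vector_type(4)));

int main()
{
    // Eight fp16 lanes occupy 16 bytes, the same width as one float4.
    half8_t h8 = {1, 2, 3, 4, 5, 6, 7, 8};

    // Bit-for-bit reinterpretation: this is what allows one 128-bit
    // raw_buffer_store_fp32x4 to replace two 64-bit fp16x4 stores; the
    // bytes that reach memory are identical either way.
    float4_t f4;
    static_assert(sizeof(h8) == sizeof(f4), "vector widths must match");
    std::memcpy(&f4, &h8, sizeof(f4));

    uint32_t first_dword;
    std::memcpy(&first_dword, &f4, sizeof(first_dword));
    std::printf("first dword of the reinterpreted vector: 0x%08x\n", first_dword);
    return 0;
}
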
......
......@@ -16,6 +16,7 @@
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
......@@ -104,7 +105,7 @@ int main(int argc, char* argv[])
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t Hi = 540;
constexpr index_t Hi = 544;
constexpr index_t Wi = 960;
constexpr index_t K = 16;
constexpr index_t Y = 3;
......@@ -380,20 +381,22 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw();
device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw<in_data_t,
8,
acc_data_t,
out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
#if 1
device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
#else
device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
#endif
<in_data_t, 8, acc_data_t, out_data_t>(tmp[I0],
tmp[I1],
tmp[I2],
tmp[I3],
tmp[I4],
tmp[I5],
tmp[I6],
in,
wei,
out_device,
nrepeat);
}
#endif
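
The call site above switches between the static and dynamic v5r1 drivers by letting the preprocessor pick the function name while the template arguments and the argument list are shared. A minimal standalone illustration of that pattern (conv_static and conv_dynamic are hypothetical names, not the real driver functions):

#include <cstdio>

// Two function templates with identical signatures, so one call site can
// share the template arguments and call arguments; only the name changes.
template <typename T, int InVectorSize>
void conv_static(T x)  { std::printf("static  driver, x=%d, vec=%d\n", int(x), InVectorSize); }

template <typename T, int InVectorSize>
void conv_dynamic(T x) { std::printf("dynamic driver, x=%d, vec=%d\n", int(x), InVectorSize); }

int main()
{
#if 1
    conv_static
#else
    conv_dynamic
#endif
        <int, 8>(42); // shared template arguments and call arguments
    return 0;
}
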
......