Commit 64705e7d authored by Jing Zhang

for binary dumps

parent e9575251
@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
static constexpr auto NPerBlock = I1;
-static constexpr FloatAcc alpha = 0.30000001192092896;
+static constexpr FloatAcc alpha = 0.3;
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
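A note on the alpha change above: 0.30000001192092896 is exactly the single-precision value nearest 0.3, printed back as a double. So when FloatAcc is float the two literals produce the same bit pattern and the edit is cosmetic; it would only change behavior if FloatAcc were double. A minimal standalone check (not part of this commit, assuming FloatAcc is float):

```cpp
#include <cstdio>

int main()
{
    float a = 0.30000001192092896; // long literal: float(0.3) printed as a double
    float b = 0.3;                 // short literal, rounded to the same float
    std::printf("%.17g %.17g same=%d\n",
                static_cast<double>(a), static_cast<double>(b), a == b);
    // prints: 0.30000001192092896 0.30000001192092896 same=1
}
```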
@@ -360,7 +360,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
__host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop()
{
-const bool has_main_e1_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
+const bool has_main_e1_block_loop = ((E1 + E1PerBlock) / (2 * E1PerBlock)) > 1;
return has_main_e1_block_loop;
}
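The added parentheses are purely cosmetic, since `/` already binds tighter than `>`. With integer division the condition holds exactly when E1 + E1PerBlock >= 4 * E1PerBlock, i.e. E1 >= 3 * E1PerBlock. For example, E1 = 8 with E1PerBlock = 2 gives (8 + 2) / 4 = 2 > 1 (true), while E1 = 4 gives (4 + 2) / 4 = 1 (false).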
@@ -699,9 +699,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
decltype(a_e0_e1_k0_k1_e2_grid_desc),
decltype(a_e0_e1_k0_k1_e2_block_copy_desc),
ABlockTransferSrcAccessOrder,
-Sequence<0, 1, 2, 3, 4>, // ABlockTransferDstAccessOrder
+Sequence<0, 1, 2, 3, 4>,
ABlockTransferSrcVectorDim,
-4, // ABlockTransferDstVectorDim
+4,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_E2,
1,
@@ -9,6 +9,8 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
+//__device__ index_t get_block_1d_id() { return gridDim.x - 1 - blockIdx.x; }
} // namespace ck
#endif
@@ -71,7 +71,7 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
-constexpr index_t InWeiVectorSize = 8;
+constexpr index_t InWeiVectorSize = C1;
if(C1 % InWeiVectorSize != 0)
{
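With `InWeiVectorSize = C1`, the guard above can no longer fire: `C1 % C1 == 0` for any positive `C1`. The practical effect is that the vector width now always tracks the `C1` dimension instead of requiring `C1` to be a multiple of the hard-coded 8.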
@@ -171,6 +171,11 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_resize_add_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_addout_n" << N << "k" << K0
<< "h" << Ho * 2 << "w" << Wo * 2 << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
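The added std::cerr statement emits a shape tag, presumably used to label the binary dumps this commit is about. As a purely illustrative example (these sizes are not taken from this file), a run with N=1, C0=2, Hi=1080, Wi=1920, C1=8, K=32, Y=3, X=3, K0=4, K1=8, Ho=1080, Wo=1920 would print:

```
conv_bias_activ_resize_add_input_n1c2h1080w1920c8_filter_k32c2y3x3c8_addout_n1k4h2160w3840k8
```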
@@ -64,7 +64,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
-constexpr index_t InWeiVectorSize = 8;
+constexpr index_t InWeiVectorSize = C1;
if(C1 % InWeiVectorSize != 0)
{
@@ -157,6 +157,11 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
<< "h" << Ho << "w" << Wo << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
@@ -73,7 +73,7 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
max_n_k0_hx_wx_k1_device_buf.ToDevice(max_n_k0_hx_wx_k1.mData.data());
-constexpr index_t InWeiVectorSize = 8;
+constexpr index_t InWeiVectorSize = C1;
if(C1 % InWeiVectorSize != 0)
{
@@ -173,6 +173,12 @@ void device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_maxpool_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
<< "h" << Ho << "w" << Wo << "k" << K1 << "_maxpoolout_n" << N << "k" << K0 << "h"
<< Ho / 2 << "w" << Wo / 2 << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
@@ -93,7 +93,7 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
-constexpr index_t activ_type = 0;
+constexpr index_t activ_type = 1;
#if 0
constexpr auto N = Number<1>{};
@@ -105,7 +105,7 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{};
-#elif 1
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
@@ -125,8 +125,8 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
-#elif 0
-constexpr auto N = Number<1>{};
+#elif 1
+constexpr auto N = Number<128>{};
constexpr auto Hi = Number<135>{};
constexpr auto Wi = Number<240>{};
constexpr auto Y = Number<3>{};
@@ -92,7 +92,20 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]);
+constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::Sigmoid;
+// constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if 0
+constexpr auto N = Number<1>{};
+constexpr auto Hi = Number<1080>{};
+constexpr auto Wi = Number<1920>{};
+constexpr auto Y = Number<3>{};
+constexpr auto X = Number<3>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
+constexpr auto K0 = Number<1>{};
+constexpr auto K1 = Number<4>{};
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
@@ -102,18 +115,28 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{};
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
+#elif 0
+constexpr auto N = Number<1>{};
+constexpr auto Hi = Number<1080>{};
+constexpr auto Wi = Number<1920>{};
+constexpr auto Y = Number<1>{};
+constexpr auto X = Number<1>{};
+constexpr auto C0 = Number<2>{};
+constexpr auto C1 = Number<8>{};
+constexpr auto K0 = Number<2>{};
+constexpr auto K1 = Number<8>{};
#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{};
-constexpr auto Y = Number<3>{};
-constexpr auto X = Number<3>{};
+constexpr auto Y = Number<1>{};
+constexpr auto X = Number<1>{};
constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{};
-constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
+constexpr auto K0 = Number<8>{};
#elif 1
-constexpr auto N = Number<1>{};
+constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<1>{};
@@ -128,6 +151,7 @@ int main(int argc, char* argv[])
constexpr auto conv_stride_w = I1;
constexpr auto conv_dilation_h = I1;
constexpr auto conv_dilation_w = I1;
+#if 0
constexpr auto in_left_pad_h = I1;
constexpr auto in_left_pad_w = I1;
@@ -260,8 +284,6 @@ int main(int argc, char* argv[])
in_right_pads_dev);
};
-constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::LeakyRelu;
#if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC)
{
@@ -96,6 +96,16 @@ int main(int argc, char* argv[])
constexpr index_t activ_type = 1;
#if 0
+constexpr auto N = Number<1>{};
+constexpr auto Hi = Number<1080>{};
+constexpr auto Wi = Number<1920>{};
+constexpr auto Y = Number<3>{};
+constexpr auto X = Number<3>{};
+constexpr auto C0 = Number<3>{};
+constexpr auto C1 = Number<4>{};
+constexpr auto K0 = Number<2>{};
+constexpr auto K1 = Number<8>{};
+#elif 0
constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{};
@@ -116,7 +126,7 @@ int main(int argc, char* argv[])
constexpr auto K0 = Number<2>{};
constexpr auto K1 = Number<8>{};
#elif 1
-constexpr auto N = Number<1>{};
+constexpr auto N = Number<128>{};
constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{};
constexpr auto Y = Number<3>{};
@@ -4,7 +4,7 @@
template <typename T>
inline auto activ(T v, const ck::index_t activ_type)
{
-const T alpha = 0.30000001192092896;
+const T alpha = 0.3;
switch(activ_type)
{
case 0: return v;
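Only `case 0` (the identity) is visible in this hunk; alpha is presumably the leaky-ReLU slope used by one of the other cases. A hypothetical reconstruction of the full helper, with cases 1 and 2 guessed from the LeakyRelu/Sigmoid activation types referenced elsewhere in this commit:

```cpp
#include <cmath>

// Sketch only: just "case 0: return v;" appears in the diff; the other
// cases and the ck::index_t parameter type are assumptions.
template <typename T>
inline auto activ(T v, const ck::index_t activ_type)
{
    const T alpha = 0.3; // leaky-ReLU slope (per this hunk)
    switch(activ_type)
    {
    case 0: return v;                                       // pass-through
    case 1: return v >= 0 ? v : v * alpha;                  // leaky ReLU (guess)
    case 2: return static_cast<T>(1 / (1 + std::exp(-v)));  // sigmoid (guess)
    default: return v;
    }
}
```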
@@ -127,9 +127,8 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
-for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
@@ -139,6 +138,8 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
+for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
+{
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
static_cast<const double>(wei(k, c0, y, x, c1));
@@ -185,9 +186,9 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
-for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
@@ -197,17 +198,20 @@ void host_direct_convolution_add_nchwc(const Tensor<TIn>& in,
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
+for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
+{
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
-     static_cast<const double>(
-         wei(k0 * out_host.mDesc.GetLengths()[4] + k1, c0, y, x, c1));
+     static_cast<const double>(wei(k, c0, y, x, c1));
}
}
}
}
}
-v = activ(v, activ_type) + bias(k0, k1);
+v += bias(k0, k1);
+v = activ(v, activ_type);
const int hox2 = ho * 2;
const int wox2 = wo * 2;
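The bias/activation reorder above is the substantive change in this host reference: bias is now added before the activation, i.e. activ(conv + bias) instead of activ(conv) + bias. The two differ whenever the activation is nonlinear around the pre-bias value; e.g. with v = -1, bias = 2 and a leaky-ReLU slope of 0.3, the old order yields activ(-1) + 2 = -0.3 + 2 = 1.7 while the new order yields activ(-1 + 2) = activ(1) = 1.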
@@ -253,9 +257,9 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0;
auto k = k0 * out_host.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{
-for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
-{
for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
{
@@ -265,10 +269,11 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
+for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
+{
v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
-    static_cast<const double>(
-        wei(k0 * out_host.mDesc.GetLengths()[4] + k1, c0, y, x, c1));
+    static_cast<const double>(wei(k, c0, y, x, c1));
}
}
}