Commit b725e3fc authored by Chao Liu's avatar Chao Liu
Browse files

Merge remote-tracking branch 'origin/develop' into miopen_downstream-dynamic_reduction_pr

parents df0d6810 f3acd251
#pragma once
#include "host_tensor.hpp"
template <typename TOut,
typename TIn,
typename TWei,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_direct_convolution_backward_weights(
const Tensor<TOut>& out,
const Tensor<TIn>& in,
Tensor<TWei>& wei,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
auto f_kcyx = [&](auto k, auto c, auto y, auto x) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[2]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[3]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
v += static_cast<const double>(in(n, c, hi, wi)) *
static_cast<const double>(out(n, k, ho, wo));
}
}
}
}
wei(k, c, y, x) = v;
};
auto f_kyxc = [&](auto k, auto y, auto x, auto c) {
double v = 0;
for(int n = 0; n < out.mDesc.GetLengths()[0]; ++n)
{
for(int ho = 0; ho < out.mDesc.GetLengths()[1]; ++ho)
{
int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0];
for(int wo = 0; wo < out.mDesc.GetLengths()[2]; ++wo)
{
int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
wi < in.mDesc.GetLengths()[2])
{
v += static_cast<const double>(in(n, hi, wi, c)) *
static_cast<const double>(out(n, ho, wo, k));
}
}
}
}
wei(k, y, x, c) = v;
};
if(layout == ConvTensorLayout::NCHW)
{
make_ParallelTensorFunctor(f_kcyx,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_kyxc,
wei.mDesc.GetLengths()[0],
wei.mDesc.GetLengths()[1],
wei.mDesc.GetLengths()[2],
wei.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
#pragma once
#include "host_tensor.hpp"
#include "gemm_common.hpp"
template <typename AType, typename BType, typename CType>
void host_gemm(const Tensor<AType>& a,
const Tensor<BType>& b,
Tensor<CType>& c,
const GemmMatrixLayout layout)
{
if(layout == GemmMatrixLayout::MK_KN_MN)
{
auto f_mk_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::MK_NK_MN)
{
auto f_mk_nk_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[1];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_KN_MN)
{
auto f_km_kn_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else if(layout == GemmMatrixLayout::KM_NK_MN)
{
auto f_km_nk_mn = [&](auto m, auto n) {
const int K = a.mDesc.GetLengths()[0];
double v = 0;
for(int k = 0; k < K; ++k)
{
v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
}
c(m, n) = v;
};
make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("wrong! not supported layout");
}
}
......@@ -9,8 +9,8 @@ struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
int NPerBlock;
int KPerBlock;
int MPerWave;
int NPerWave;
int MPerXDL;
int NPerXDL;
int K1;
int MRepeat;
......@@ -45,8 +45,8 @@ static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
128, // MPerBlock,
128, // NPerBlock,
4, // KPerBlock,
32, // MPerWave,
32, // NPerWave,
32, // MPerXDL,
32, // NPerXDL,
4, // K1,
2, // MRepeat,
2, // NRepeat,
......
......@@ -12,13 +12,16 @@
#export OLC_DEBUG_HIP_DUMP=1
#export OLC_DEBUG_SAVE_TEMP_DIR=1
make -j conv_fwd_driver_offline
make -j conv_bwd_driver_offline
make -j conv_fwd_driver_online
#rm -rf /root/_hip_binary_kernels_/
#rm -rf /tmp/olCompile*
#make -j conv_fwd_driver_offline
#make -j conv_bwd_driver_offline
#make -j conv_wrw_driver_offline
#make -j conv_fwd_driver_online
make -j gemm_driver_offline
LAYOUT=$1
ALGO=$2
VERIFY=$3
......@@ -30,7 +33,7 @@ REPEAT=$6
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 1024 1 7 17 17 1 1 1 1 0 3 0 3
./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 128 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 512 512 3 3 7 7 1 1 1 1 1 1 1 1
......@@ -44,4 +47,12 @@ REPEAT=$6
#./host/driver_offline/conv_bwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_wrw_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 128 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
################################################ layout algo verify init log repeat M___ N___ K___
#./host/driver_offline/gemm_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 960 1024 1024
#./host/driver_offline/gemm_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1920 2048 2048
./host/driver_offline/gemm_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 3840 4096 4096
#./host/driver_offline/gemm_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 7680 8192 8192
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment